def get_process_dirs(cls, worker_name: WorkerName, job_id=None, role=None, party_id=None, task: Task = None):
    """Generate a worker id and resolve the config/log directories for that worker.

    The directory layout is chosen in this order:

    1. ``task`` is given: the config directory is nested under the task's
       component name, task id and task version, followed by the worker name
       and worker id; the log directory is the component's job log directory.
    2. ``job_id``, ``role`` and ``party_id`` are all supplied: job-level
       directories keyed by the worker name and worker id.
    3. otherwise: the general worker directories.

    Only the config directory is created here; the log directory is returned
    without being created.

    :param worker_name: object whose ``.value`` names the worker type
    :param job_id: job identifier, optional
    :param role: party role, optional
    :param party_id: party identifier, optional; converted to ``str`` for path building
    :param task: task record providing ``f_component_name``, ``f_task_id`` and ``f_task_version``
    :return: tuple ``(worker_id, config_dir, log_dir)``
    """
    worker_id = base_utils.new_unique_id()
    # Record whether a party id was really supplied *before* stringifying it:
    # str(None) is the truthy string "None", which previously made the
    # "job_id and role and party_id" test pass for a missing party id and
    # produced a ".../None/..." job directory instead of the general one.
    party_id_supplied = party_id is not None
    party_id = str(party_id)
    if task:
        config_dir = job_utils.get_job_directory(
            job_id, role, party_id, task.f_component_name, task.f_task_id,
            str(task.f_task_version), worker_name.value, worker_id)
        log_dir = job_utils.get_job_log_directory(job_id, role, party_id, task.f_component_name)
    elif job_id and role and party_id_supplied and party_id:
        config_dir = job_utils.get_job_directory(job_id, role, party_id, worker_name.value, worker_id)
        log_dir = job_utils.get_job_log_directory(job_id, role, party_id, worker_name.value, worker_id)
    else:
        config_dir = job_utils.get_general_worker_directory(worker_name.value, worker_id)
        log_dir = job_utils.get_general_worker_log_directory(worker_name.value, worker_id)
    os.makedirs(config_dir, exist_ok=True)
    return worker_id, config_dir, log_dir
def download_upload(access_module):
    """Handle an upload/download request by submitting a data-access job.

    For an upload where the client ships the file itself, the file is first
    saved under ``<job directory>/fate_upload_tmp`` and the job config is read
    from the query string; otherwise the job config is the JSON request body.
    The config is validated, a few values are normalised to ``int``, and the
    job is submitted through ``DAGScheduler.submit``.

    :param access_module: ``'upload'`` or ``'download'``
    :return: the result of ``get_json_result``; retcode 100 when the target
             table already exists and ``drop`` was not set to 1
    :raises Exception: when ``access_module`` is neither upload nor download
    """
    job_id = job_utils.generate_job_id()
    client_sends_file = (
        access_module == "upload"
        and UPLOAD_DATA_FROM_CLIENT
        and not (request.json and request.json.get("use_local_data") == 0)
    )
    if client_sends_file:
        file = request.files['file']
        filename = os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp', file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception as e:
            # drop the partially written upload before propagating the error
            shutil.rmtree(os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp'))
            raise e
        job_config = request.args.to_dict()
        if not ("namespace" in job_config and "table_name" in job_config):
            # higher than version 1.5.1, support eggroll run parameters
            job_config = json_loads(list(job_config)[0])
        job_config['file'] = filename
    else:
        job_config = request.json

    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments += ['file', 'head', 'partition']
    elif access_module == 'download':
        required_arguments.append('output_path')
    else:
        raise Exception('can not support this operating: {}'.format(access_module))
    detect_utils.check_config(job_config, required_arguments=required_arguments)

    data = {}
    # compatibility
    if "table_name" in job_config:
        job_config["name"] = job_config["table_name"]
    if "backend" not in job_config:
        job_config["backend"] = 0
    for int_key in ("work_mode", "backend", "head", "partition", "drop"):
        if int_key in job_config:
            job_config[int_key] = int(job_config[int_key])

    if access_module == "upload":
        job_config["destroy"] = True if job_config.get('drop', 0) == 1 else False
        data['table_name'] = job_config["table_name"]
        data['namespace'] = job_config["namespace"]
        data_table_meta = storage.StorageTableMeta(name=job_config["table_name"],
                                                   namespace=job_config["namespace"])
        if data_table_meta and not job_config["destroy"]:
            already_exists_msg = (
                'The data table already exists.'
                'If you still want to continue uploading, please add the parameter -drop.'
                ' 0 means not to delete and continue uploading, '
                '1 means to upload again after deleting the table'
            )
            return get_json_result(retcode=100, retmsg=already_exists_msg)

    job_dsl, job_runtime_conf = gen_data_access_job_config(job_config, access_module)
    submission = {'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf}
    submit_result = DAGScheduler.submit(submission, job_id=job_id)
    data.update(submit_result)
    return get_json_result(job_id=job_id, data=data)
def start_task(job_id, component_name, task_id, role, party_id, task_config):
    """Dump the task config to disk and launch the task executor subprocess.

    The config is written to ``task_config.json`` inside the task directory
    (``<job directory>/<role>/<party_id>/<component_name>``) and the executor
    module is started with ``python3``. Exceptions are logged rather than
    propagated; the outcome is reported in the final log line.

    :return: None
    """
    schedule_logger.info(
        f'job {job_id} {component_name} {role} {party_id} task subprocess is ready')
    launched = False
    try:
        job_dir = job_utils.get_job_directory(job_id=job_id)
        task_dir = os.path.join(job_dir, role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)

        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as config_file:
            json.dump(task_config, config_file)

        executor_script = sys.modules[TaskExecutor.__module__].__file__
        process_cmd = [
            'python3', executor_script,
            '-j', job_id,
            '-n', component_name,
            '-t', task_id,
            '-r', role,
            '-p', party_id,
            '-c', task_config_path,
        ]
        job_log_dir = job_utils.get_job_log_directory(job_id=job_id)
        task_log_dir = os.path.join(job_log_dir, role, party_id, component_name)
        schedule_logger.info(
            f'job {job_id} {component_name} {role} {party_id} task subprocess start')
        process = job_utils.run_subprocess(config_dir=task_dir,
                                           process_cmd=process_cmd,
                                           log_dir=task_log_dir)
        if process:
            launched = True
    except Exception as e:
        schedule_logger.exception(e)
    finally:
        outcome = 'success' if launched else 'failed'
        schedule_logger.info(
            f'job {job_id} component {component_name} on {role} {party_id} start task subprocess {outcome}')
def start_inheriting_job(cls, job):
    """Flag the job's inheritance as running and spawn the inheritance subprocess.

    The job record is updated with ``JobInheritanceStatus.RUNNING``, the job
    directory is created if needed, and the ``JobInherit`` module is launched
    with the current interpreter (falling back to ``python3``). Logs go to the
    ``job_inheritance`` folder under the job log directory.

    :param job: job record providing ``f_job_id``, ``f_role`` and ``f_party_id``
    :return: None
    """
    job_id = job.f_job_id
    job_info = {
        "job_id": job_id,
        "role": job.f_role,
        "party_id": job.f_party_id,
        "inheritance_status": JobInheritanceStatus.RUNNING,
    }
    JobSaver.update_job(job_info=job_info)

    conf_dir = job_utils.get_job_directory(job_id=job_id)
    os.makedirs(conf_dir, exist_ok=True)

    interpreter = sys.executable or 'python3'
    inherit_script = sys.modules[JobInherit.__module__].__file__
    process_cmd = [
        interpreter, inherit_script,
        '--job_id', job.f_job_id,
        '--role', job.f_role,
        '--party_id', job.f_party_id,
    ]
    log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job.f_job_id), "job_inheritance")
    process_utils.run_subprocess(job_id=job.f_job_id,
                                 config_dir=conf_dir,
                                 process_cmd=process_cmd,
                                 log_dir=log_dir,
                                 process_name="job_inheritance")
def upload_file_block(self, file_list, data_head, table_list):
    """Upload each file block in its own subprocess, then union the tables.

    When ``data_head`` is truthy the table meta is updated first. One
    ``UploadFile`` subprocess is started per entry of ``file_list``; the entry
    at the same index in ``table_list`` supplies the target namespace and name.
    After all subprocesses are started they are handed to
    ``check_upload_process`` and the tables are merged with ``union_table``.

    :param file_list: paths of the file blocks to upload
    :param data_head: header information passed to ``update_table_meta``
    :param table_list: per-block dict-like objects read via ``.get("namespace")`` / ``.get("name")``
    :return: None
    """
    if data_head:
        self.update_table_meta(data_head)

    upload_process = []
    for block_index, block_file in enumerate(file_list):
        tracker = self.tracker
        task_dir = os.path.join(job_utils.get_job_directory(job_id=tracker.job_id),
                                tracker.role,
                                str(tracker.party_id),
                                tracker.component_name,
                                'upload')
        os.makedirs(task_dir, exist_ok=True)

        interpreter = sys.executable or 'python3'
        upload_script = sys.modules[upload_utils.UploadFile.__module__].__file__
        target_table = table_list[block_index]
        process_cmd = [
            interpreter, upload_script,
            '--session_id', self.session_id,
            '--storage', self.storage_engine,
            '--file', block_file,
            '--namespace', target_table.get("namespace"),
            '--name', target_table.get("name"),
            '--partitions', self.parameters.get('partition'),
        ]
        LOGGER.info(process_cmd)

        # logs for each block land in <job log dir>/<role>/<party>/<component>/block_<index>
        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=self.tracker.job_id),
                                    self.tracker.role,
                                    str(self.tracker.party_id),
                                    self.tracker.component_name,
                                    f'block_{block_index}')
        upload_process.append(
            process_utils.run_subprocess(job_id=self.tracker.job_id,
                                         config_dir=task_dir,
                                         process_cmd=process_cmd,
                                         log_dir=task_log_dir))

    self.check_upload_process(upload_process)
    self.union_table(table_list)
def run(job_id, component_name, task_id, task_version, role, party_id, task_parameters_path, task_info, **kwargs):
    """Launch the task executor subprocess and record its pid in ``task_info``.

    The executor module is started with the current interpreter. The directory
    holding ``task_parameters_path`` is used as the config directory; the log
    and job directories are ``<job (log) directory>/<role>/<party_id>/<component_name>``.

    :param task_parameters_path: path of the task parameters file passed via ``-c``
    :param task_info: dict updated in place with ``run_pid``
    :return: the process object returned by ``job_utils.run_subprocess``
    """
    job_server_address = '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT)
    executor_script = sys.modules[TaskExecutor.__module__].__file__
    process_cmd = [
        sys.executable, executor_script,
        '-j', job_id,
        '-n', component_name,
        '-t', task_id,
        '-v', task_version,
        '-r', role,
        '-p', party_id,
        '-c', task_parameters_path,
        '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
        '--job_server', job_server_address,
    ]

    relative_parts = (role, party_id, component_name)
    task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), *relative_parts)
    task_job_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), *relative_parts)

    schedule_logger(job_id).info(
        f'job {job_id} task {task_id} {task_version} on {role} {party_id} executor subprocess is ready')

    task_dir = os.path.dirname(task_parameters_path)
    process = job_utils.run_subprocess(job_id=job_id,
                                       config_dir=task_dir,
                                       process_cmd=process_cmd,
                                       log_dir=task_log_dir,
                                       job_dir=task_job_dir)
    task_info["run_pid"] = process.pid
    return process
def download_upload(data_func):
    """Run an upload/download module as a subprocess for the JSON request body.

    A job id and job directory are created, the request is validated, a
    runtime conf file is written, and the module configured in
    ``JOB_MODULE_CONF`` is launched with ``python3``.

    :param data_func: ``'upload'`` or ``'download'``
    :return: the result of ``get_json_result``; retcode 0 when the subprocess
             started, 101 when it did not, -104 when preparing it failed
    :raises Exception: when ``data_func`` is neither upload nor download
    """
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info(f'generated job_id {_job_id}, body {request_config}')
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)

    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments += ['file', 'head', 'partition']
    elif module == 'download':
        required_arguments.append('output_path')
    else:
        raise Exception(f'can not support this operating: {module}')
    detect_utils.check_config(request_config, required_arguments=required_arguments)

    # relative upload paths are resolved against the project base directory
    if module == "upload" and not os.path.isabs(request_config['file']):
        request_config["file"] = os.path.join(file_utils.get_project_base_directory(),
                                              request_config["file"])
    try:
        conf_file_path = new_runtime_conf(
            job_dir=_job_dir,
            method=data_func,
            module=module,
            role=request_config.get('local', {}).get("role"),
            party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)

        module_script = os.path.join(file_utils.get_project_base_directory(),
                                     JOB_MODULE_CONF[module]["module_path"])
        progs = ["python3", module_script, "-j", _job_id, "-c", conf_file_path]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None

        retcode = 0 if p else 101
        result_data = {
            'table_name': request_config['table_name'],
            'namespace': request_config['namespace'],
            'pid': p.pid if p else '',
        }
        return get_json_result(retcode=retcode, job_id=_job_id, data=result_data)
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id, **kwargs):
    """
    Authenticate the component, write the task parameters and start the task
    through the engine selected by the job's computing engine. Errors while
    starting are logged, not raised; afterwards the task record is updated and
    its party status is set to RUNNING and, on failure, then to FAILED.

    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :param kwargs: read for ``src_party_id``, ``src_role``, ``src_user`` and ``user_id``
    :return: None
    """
    job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
    PrivilegeAuth.authentication_component(job_dsl,
                                           src_party_id=kwargs.get('src_party_id'),
                                           src_role=kwargs.get('src_role'),
                                           party_id=party_id,
                                           component_name=component_name)
    schedule_logger(job_id).info(
        f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess")

    task_executor_process_start_status = False
    task_info = dict(job_id=job_id,
                     task_id=task_id,
                     task_version=task_version,
                     role=role,
                     party_id=party_id)
    is_failed = False
    try:
        matching_tasks = JobSaver.query_task(task_id=task_id,
                                             task_version=task_version,
                                             role=role,
                                             party_id=party_id)
        task = matching_tasks[0]

        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        run_parameters_dict["src_user"] = kwargs.get("src_user")
        run_parameters = RunParameters(**run_parameters_dict)

        config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                  component_name, task_id, task_version)
        os.makedirs(config_dir, exist_ok=True)
        run_parameters_path = os.path.join(config_dir, 'task_parameters.json')
        with open(run_parameters_path, 'w') as parameters_file:
            parameters_file.write(json_dumps(run_parameters_dict))

        schedule_logger(job_id).info(
            f"use computing engine {run_parameters.computing_engine}")
        task_info["engine_conf"] = {"computing_engine": run_parameters.computing_engine}

        backend_engine = build_engine(run_parameters.computing_engine)
        run_info = backend_engine.run(
            task=task,
            run_parameters=run_parameters,
            run_parameters_path=run_parameters_path,
            config_dir=config_dir,
            log_dir=job_utils.get_job_log_directory(job_id, role, party_id, component_name),
            cwd_dir=job_utils.get_job_directory(job_id, role, party_id, component_name),
            user_name=kwargs.get("user_id"))
        task_info.update(run_info)
        task_info["start_time"] = current_timestamp()
        task_executor_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
        is_failed = True
    finally:
        try:
            cls.update_task(task_info=task_info)
            # the status is always moved to RUNNING first, then to FAILED if starting failed
            task_info["party_status"] = TaskStatus.RUNNING
            cls.update_task_status(task_info=task_info)
            if is_failed:
                task_info["party_status"] = TaskStatus.FAILED
                cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        outcome = "success" if task_executor_process_start_status else "failed"
        schedule_logger(job_id).info(
            f"task {task_id} {task_version} on {role} {party_id} executor subprocess start {outcome}")
def run_task(job_id, component_name, task_id, role, party_id, task_config):
    """Write the task config and launch the task executor for the configured backend.

    The config is dumped to ``task_config.json`` in
    ``<job directory>/<role>/<party_id>/<component_name>``. For the eggroll
    backend the executor is started with ``python3``; for the spark backend it
    is submitted through ``$SPARK_HOME/bin/spark-submit`` (client deploy mode
    only), forwarding the entries of ``spark_submit_config``. A missing
    ``backend`` entry falls back to 0 with a warning.

    Exceptions (including a missing ``SPARK_HOME``, an unsupported deploy mode
    or an unsupported backend) are logged, not propagated; the outcome is
    reported in the final log line.

    :param task_config: dict; ``job_parameters`` is read for ``backend``,
                        ``processors_per_node`` and ``spark_submit_config``
    :return: None
    """
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)

        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")
        backend = Backend(backend)

        # executor script plus the arguments common to every backend
        executor_args = [
            sys.modules[TaskExecutor.__module__].__file__,
            '-j', job_id,
            '-n', component_name,
            '-t', task_id,
            '-r', role,
            '-p', party_id,
            '-c', task_config_path,
        ]

        if backend.is_eggroll():
            process_cmd = ['python3'] + executor_args + [
                '--processors_per_node',
                str(task_config['job_parameters'].get("processors_per_node", 0)),
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # additional configs
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            # every entry except "conf" becomes a --key=value option
            process_cmd.extend(f'--{k}={v}' for k, v in spark_submit_config.items() if k != "conf")
            # "conf" entries are passed as repeated "--conf key=value" pairs
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend(executor_args)
            process_cmd.extend(['--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT)])
        else:
            # the previous message read "${backend} supported": a stray "$" and a
            # missing "not", i.e. the opposite of what is meant
            raise ValueError(f"backend {backend} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
def download_upload(access_module):
    """Handle an upload/download request by submitting a data-access job.

    For an upload where the client ships the file itself, the file is saved
    under ``<job directory>/fate_upload_tmp`` and the request config is read
    from the query string; otherwise the config is the JSON request body.
    When ``WORK_MODE`` is not 0 and the target table already holds data, the
    ``drop`` parameter decides: 2 (the default) rejects the upload with
    retcode 100, 1 destroys the existing table first.

    :param access_module: ``'upload'``, ``'download'`` or ``'download_test'``
    :return: the result of ``get_json_result``
    :raises Exception: when ``access_module`` is not one of the supported values
    """
    job_id = generate_job_id()
    if access_module == "upload" and USE_LOCAL_DATA and not (
            request.json and request.json.get("use_local_data") == 0):
        file = request.files['file']
        upload_tmp_dir = os.path.join(get_job_directory(job_id), 'fate_upload_tmp')
        # NOTE(review): file.filename comes from the client and is joined into a
        # server-side path unchecked — consider sanitising it.
        filename = os.path.join(upload_tmp_dir, file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception:
            # Remove the directory the upload was actually written to. The
            # previous cleanup targeted '<job dir>/tmp', which does not exist,
            # so rmtree raised and masked the original save error.
            shutil.rmtree(upload_tmp_dir, ignore_errors=True)
            raise
        request_config = request.args.to_dict()
        request_config['file'] = filename
    else:
        request_config = request.json

    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif access_module in ('download', 'download_test'):
        required_arguments.extend(['output_path'])
    else:
        raise Exception(
            'can not support this operating: {}'.format(access_module))
    detect_utils.check_config(request_config,
                              required_arguments=required_arguments)

    data = {}
    if access_module == "upload":
        data['table_name'] = request_config["table_name"]
        data['namespace'] = request_config["namespace"]
        if WORK_MODE != 0:
            data_table = session.get_data_table(
                name=request_config["table_name"],
                namespace=request_config["namespace"])
            count = data_table.count()
            if count:
                # only parsed when the table holds data, as before
                drop = int(request_config.get('drop', 2))
                if drop == 2:
                    return get_json_result(
                        retcode=100,
                        retmsg=('The data table already exists, table data count:{}.'
                                'If you still want to continue uploading, please add the parameter -drop. '
                                '0 means not to delete and continue uploading, '
                                '1 means to upload again after deleting the table').format(count))
                if drop == 1:
                    data_table.destroy()

    job_dsl, job_runtime_conf = gen_data_access_job_config(
        request_config, access_module)
    job_id, job_dsl_path, job_runtime_conf_path, logs_directory, model_info, board_url = JobController.submit_job(
        {
            'job_dsl': job_dsl,
            'job_runtime_conf': job_runtime_conf
        },
        job_id=job_id)
    data.update({
        'job_dsl_path': job_dsl_path,
        'job_runtime_conf_path': job_runtime_conf_path,
        'board_url': board_url,
        'logs_directory': logs_directory
    })
    return get_json_result(job_id=job_id, data=data)
def download_upload(data_func):
    """Run an upload/download module as a subprocess, routing by work mode.

    A job id and job directory are created and the JSON request body is
    validated. When the requested ``work_mode`` differs from the server's, a
    standalone job on a cluster server is forwarded to the cluster standalone
    job server at the requester's address; any other mismatch is rejected.
    Otherwise a runtime conf is written and the module configured in
    ``JOB_MODULE_CONF`` is launched with ``python3``.

    :param data_func: ``'upload'`` or ``'download'``
    :return: the forwarded response, or the result of ``get_json_result``
             (retcode 0 started, 101 not started, -104 preparation failed)
    :raises Exception: for an unsupported ``data_func`` or work-mode combination
    """
    request_config = request.json
    _job_id = generate_job_id()
    stat_logger.info(f'generated job_id {_job_id}, body {request_config}')
    _job_dir = get_job_directory(_job_id)
    os.makedirs(_job_dir, exist_ok=True)

    module = data_func
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if module == 'upload':
        required_arguments += ['file', 'head', 'partition']
    elif module == 'download':
        required_arguments.append('output_path')
    else:
        raise Exception(f'can not support this operating: {module}')
    detect_utils.check_config(request_config, required_arguments=required_arguments)

    job_work_mode = request_config['work_mode']
    # todo: The current code here is redundant with job_app/submit_job, the next version of this function will be implemented by job_app/submit_job
    if job_work_mode != RuntimeConfig.WORK_MODE:
        standalone_job_on_cluster = (RuntimeConfig.WORK_MODE == WorkMode.CLUSTER
                                     and job_work_mode == WorkMode.STANDALONE)
        if not standalone_job_on_cluster:
            raise Exception('server run on standalone can not support cluster mode job')
        # use cluster standalone job server to execute standalone job
        execute_host = '{}:{}'.format(request.remote_addr, CLUSTER_STANDALONE_JOB_SERVER_PORT)
        return request_execute_server(request=request, execute_host=execute_host)

    # relative upload paths are resolved against the project base directory
    if module == "upload" and not os.path.isabs(request_config['file']):
        request_config["file"] = os.path.join(file_utils.get_project_base_directory(),
                                              request_config["file"])
    try:
        conf_file_path = new_runtime_conf(
            job_dir=_job_dir,
            method=data_func,
            module=module,
            role=request_config.get('local', {}).get("role"),
            party_id=request_config.get('local', {}).get("party_id", ''))
        file_utils.dump_json_conf(request_config, conf_file_path)

        module_script = os.path.join(file_utils.get_project_base_directory(),
                                     JOB_MODULE_CONF[module]["module_path"])
        progs = ["python3", module_script, "-j", _job_id, "-c", conf_file_path]
        try:
            p = run_subprocess(config_dir=_job_dir, process_cmd=progs)
        except Exception as e:
            stat_logger.exception(e)
            p = None

        retcode = 0 if p else 101
        result_data = {
            'table_name': request_config['table_name'],
            'namespace': request_config['namespace'],
            'pid': p.pid if p else '',
        }
        return get_json_result(retcode=retcode, job_id=_job_id, data=result_data)
    except Exception as e:
        stat_logger.exception(e)
        return get_json_result(retcode=-104, retmsg="failed", job_id=_job_id)
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id):
    """
    Start task, update status and party status

    The job parameters are written to ``task_parameters.json`` in the task
    directory and the task executor is launched according to the computing
    engine: directly with the current interpreter for EGGROLL/STANDALONE, or
    through ``$SPARK_HOME/bin/spark-submit`` (client deploy mode only) for
    SPARK. Errors are logged, not raised; the task record and its party status
    (RUNNING or FAILED) are always updated afterwards.

    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return: None
    """
    schedule_logger(job_id).info(
        'try to start job {} task {} {} on {} {} executor subprocess'.format(
            job_id, task_id, task_version, role, party_id))
    task_executor_process_start_status = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                role, party_id, component_name, task_id, task_version)
        os.makedirs(task_dir, exist_ok=True)
        task_parameters_path = os.path.join(task_dir, 'task_parameters.json')
        run_parameters_dict = job_utils.get_job_parameters(job_id, role, party_id)
        with open(task_parameters_path, 'w') as fw:
            fw.write(json_dumps(run_parameters_dict))

        run_parameters = RunParameters(**run_parameters_dict)
        schedule_logger(job_id=job_id).info(
            f"use computing engine {run_parameters.computing_engine}")

        # executor script and arguments, identical for every supported engine
        executor_args = [
            sys.modules[TaskExecutor.__module__].__file__,
            '-j', job_id,
            '-n', component_name,
            '-t', task_id,
            '-v', task_version,
            '-r', role,
            '-p', party_id,
            '-c', task_parameters_path,
            '--run_ip', RuntimeConfig.JOB_SERVER_HOST,
            '--job_server', '{}:{}'.format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT),
        ]

        if run_parameters.computing_engine in {ComputingEngine.EGGROLL, ComputingEngine.STANDALONE}:
            process_cmd = [sys.executable] + executor_args
        elif run_parameters.computing_engine == ComputingEngine.SPARK:
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # additional configs
            spark_submit_config = run_parameters.spark_run
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            # every entry except "conf" becomes a --key=value option
            process_cmd.extend(f'--{k}={v}' for k, v in spark_submit_config.items() if k != "conf")
            # "conf" entries are passed as repeated "--conf key=value" pairs
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend(executor_args)
        else:
            # the message previously carried a stray "$" before the engine name
            raise ValueError(f"{run_parameters.computing_engine} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id),
                                    role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} task {} {} on {} {} executor subprocess is ready'.format(
                job_id, task_id, task_version, role, party_id))
        p = job_utils.run_subprocess(job_id=job_id,
                                     config_dir=task_dir,
                                     process_cmd=process_cmd,
                                     log_dir=task_log_dir)
        if p:
            task_info["party_status"] = TaskStatus.RUNNING
            task_info["start_time"] = current_timestamp()
            task_executor_process_start_status = True
        else:
            task_info["party_status"] = TaskStatus.FAILED
    except Exception as e:
        schedule_logger(job_id).exception(e)
        task_info["party_status"] = TaskStatus.FAILED
    finally:
        try:
            cls.update_task(task_info=task_info)
            cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            'job {} task {} {} on {} {} executor subprocess start {}'.format(
                job_id, task_id, task_version, role, party_id,
                "success" if task_executor_process_start_status else "failed"))
def start_task(job_id, component_name, task_id, role, party_id, task_config):
    """Write the task config and launch the task executor for the configured backend.

    The config is dumped to ``task_config.json`` in
    ``<job directory>/<role>/<party_id>/<component_name>``. For the eggroll /
    eggroll2 backends the executor is started with ``python3``; for the spark
    backend it is submitted through ``$SPARK_HOME/bin/spark-submit`` (client
    deploy mode only) with the queue, memory and executor settings read from
    ``spark_submit_config``. A missing ``backend`` entry falls back to 0 with
    a warning.

    Exceptions (including a missing ``SPARK_HOME``, an unsupported deploy mode
    or an unsupported backend) are logged, not propagated; the outcome is
    reported in the final log line.

    :param task_config: dict; read for ``job_parameters`` and for
                        ``job_server`` (``ip`` and ``http_port``)
    :return: None
    """
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(
            job_id, component_name, role, party_id))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)

        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")
        backend = Backend(backend)

        # executor script plus the arguments common to every backend
        executor_args = [
            sys.modules[TaskExecutor.__module__].__file__,
            '-j', job_id,
            '-n', component_name,
            '-t', task_id,
            '-r', role,
            '-p', party_id,
            '-c', task_config_path,
        ]

        if backend.is_eggroll() or backend.is_eggroll2():
            process_cmd = ['python3'] + executor_args + [
                '--job_server',
                '{}:{}'.format(task_config['job_server']['ip'],
                               task_config['job_server']['http_port']),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            queue = spark_submit_config.get("queue", "default")
            driver_memory = spark_submit_config.get("driver-memory", "1g")
            num_executors = spark_submit_config.get("num-executors", 2)
            executor_memory = spark_submit_config.get("executor-memory", "1g")
            executor_cores = spark_submit_config.get("executor-cores", 1)
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")

            spark_home = os.environ["SPARK_HOME"]
            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [
                spark_submit_cmd,
                f'--name={task_id}#{role}',
                f'--deploy-mode={deploy_mode}',
                f'--queue={queue}',
                f'--driver-memory={driver_memory}',
                f'--num-executors={num_executors}',
                f'--executor-memory={executor_memory}',
                f'--executor-cores={executor_cores}',
            ] + executor_args + [
                '--job_server',
                '{}:{}'.format(task_config['job_server']['ip'],
                               task_config['job_server']['http_port']),
            ]
        else:
            # the previous message read "${backend} supported": a stray "$" and a
            # missing "not", i.e. the opposite of what is meant
            raise ValueError(f"backend {backend} is not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id),
                                    role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(
                job_id, component_name, role, party_id))
        p = job_utils.run_subprocess(config_dir=task_dir,
                                     process_cmd=process_cmd,
                                     log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
def run_task():
    """Entry point of the task executor process.

    Steps:

    1. Parse the command line (job id, component name, task id, role, party
       id, config path, optional job server) and load the task config. Any
       failure here is logged and the function returns.
    2. Initialise the runtime config, session and federation, redirect logging
       to the task log directory, import the component class named by the
       ``CodePath`` parameter, run it, and save its output data and model.
    3. Sync the final task status to the initiator, whatever the outcome.
    4. Block, polling every 0.5 s, until a ``kill`` file appears under
       ``<job directory>/<role>/<party_id>/<component_name>``; then stop the
       session and return.

    :return: None
    """
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        executor_pid = os.getpid()
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
        module_name = task_config['module_name']
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
        return

    try:
        # init environment, process is shared globally
        # NOTE(review): BACKEND is hard-coded to 2 here; the original source kept
        # "BACKEND=job_parameters.get('backend', 2)" commented out — confirm intent.
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'], BACKEND=2)
        session.init(job_id='{}_{}_{}'.format(task_id, role, party_id),
                     mode=RuntimeConfig.WORK_MODE,
                     backend=RuntimeConfig.BACKEND)
        federation.init(job_id=task_id, runtime_conf=parameters)
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                              parent_log_dir=job_log_dir,
                                              append_to_parent_log=True,
                                              force=True)

        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id,
                           role=role,
                           party_id=party_id,
                           component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = executor_pid

        # CodePath is "<package path>/<module>.py/<class name>"
        run_class_paths = parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id,
                                                       role=role,
                                                       party_id=party_id,
                                                       job_parameters=job_parameters,
                                                       job_args=job_args,
                                                       input_dsl=task_input_dsl)
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)

        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id,
                                      component_name=component_name,
                                      task_id=task_id,
                                      role=role,
                                      party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      initiator_role=job_initiator.get('role', None),
                                      task_info=task.to_json())

        schedule_logger().info('run {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id))
        schedule_logger().info(parameters)
        schedule_logger().info(task_input_dsl)
        run_object.run(parameters, task_run_args)

        output_data = run_object.save_data()
        tracker.save_output_data_table(
            output_data,
            task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component')
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(
            output_model,
            task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        sync_success = False
        try:
            task.f_end_time = current_timestamp()
            # f_start_time is only assigned after the environment has been
            # initialised. If initialisation failed, subtracting it raised here
            # and the FAILED status was never synced; skip the elapsed time
            # instead so the status still gets reported.
            start_time = getattr(task, 'f_start_time', None)
            if start_time is not None:
                task.f_elapsed = task.f_end_time - start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id,
                                          component_name=component_name,
                                          task_id=task_id,
                                          role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())
            sync_success = True
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)

    final_status = task.f_status if sync_success else TaskStatus.FAILED
    finish_message = 'finish {} {} {} {} {} {} task'.format(
        job_id, component_name, task_id, role, party_id, final_status)
    schedule_logger().info(finish_message)
    print(finish_message)

    # Keep the process alive until a "kill" marker file shows up, then stop the session.
    kill_path = os.path.join(job_utils.get_job_directory(job_id),
                             str(role), str(party_id), component_name, 'kill')
    while True:
        time.sleep(0.5)
        if os.path.exists(kill_path):
            try:
                session.stop()
            except Exception as e:
                # stopping is best-effort, but leave a trace instead of swallowing silently
                schedule_logger().exception(e)
            break