def detect_running_task(cls):
    """Scan RUNNING tasks and stop jobs whose executor process has died.

    For every task recorded as RUNNING, ask its computing engine whether the
    executor process is still alive. A task whose process is gone is
    re-checked once after a short delay (to avoid racing with
    task_controller.stop_task); if it is still RUNNING in the database, its
    job is requested to stop with status FAILED.
    """
    detect_logger().info('start to detect running task..')
    detected = 0
    try:
        tasks = JobSaver.query_task(party_status=TaskStatus.RUNNING, only_latest=False)
        jobs_to_stop = set()
        for task in tasks:
            # Ignore tasks with no engine conf that run neither on this
            # server host nor on this party.
            if (not task.f_engine_conf
                    and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST
                    and not task.f_run_on_this_party):
                continue
            detected += 1
            try:
                engine = build_engine(task.f_engine_conf.get("computing_engine"))
                if engine.is_alive(task):
                    continue
                msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                detect_logger(job_id=task.f_job_id).info(
                    f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist")
                # Re-check after a short delay: the task may have been
                # stopped legitimately in the meantime.
                time.sleep(3)
                _tasks = JobSaver.query_task(task_id=task.f_task_id,
                                             task_version=task.f_task_version,
                                             role=task.f_role,
                                             party_id=task.f_party_id)
                if not _tasks:
                    detect_logger(task.f_job_id).warning(f"{msg} can not found on db")
                elif _tasks[0].f_party_status == TaskStatus.RUNNING:
                    jobs_to_stop.add(task.f_job_id)
                    detect_logger(task.f_job_id).info(
                        f"{msg} party status has been checked twice, try to stop job")
                else:
                    detect_logger(task.f_job_id).info(
                        f"{msg} party status has changed to {_tasks[0].f_party_status}, may be stopped by task_controller.stop_task, pass stop job again")
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if jobs_to_stop:
            detect_logger().info('start to stop jobs: {}'.format(jobs_to_stop))
            stop_jobs = set()
            for job_id in jobs_to_stop:
                found = JobSaver.query_job(job_id=job_id)
                if found:
                    stop_jobs.add(found[0])
            cls.request_stop_jobs(jobs=stop_jobs,
                                  stop_msg="task executor process abort",
                                  stop_status=JobStatus.FAILED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {detected} running task")
def kill_task(cls, task: Task):
    """Kill the executor process of ``task`` through its computing engine.

    :param task: the task whose executor process should be terminated
    :return: True when the engine kill call completed without raising,
             False otherwise; the outcome is always logged.
    """
    killed = False
    try:
        engine = build_engine(task.f_engine_conf.get("computing_engine"))
        if engine:
            engine.kill(task)
    except Exception as e:
        schedule_logger(task.f_job_id).exception(e)
    else:
        # No exception from build/kill counts as success.
        killed = True
    finally:
        schedule_logger(task.f_job_id).info(
            'job {} task {} {} on {} {} process {} kill {}'.format(
                task.f_job_id, task.f_task_id, task.f_task_version,
                task.f_role, task.f_party_id, task.f_run_pid,
                'success' if killed else 'failed'))
    return killed
def detect_running_task(cls):
    """Detect RUNNING tasks whose executor process has died and stop their jobs.

    For every task recorded as RUNNING on this party/server, ask its
    computing engine whether the executor process is still alive; jobs
    owning dead tasks are requested to stop.

    Fix: an aborted executor process is a failure, not a user-initiated
    cancellation, so stopped jobs are marked ``JobStatus.FAILED`` (matching
    the stop_msg "task executor process abort" and the behavior of the
    double-checking variant of this detector).
    """
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        running_tasks = JobSaver.query_task(party_status=TaskStatus.RUNNING,
                                            only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            # Ignore tasks with no engine conf that run neither on this
            # server host nor on this party.
            if (not task.f_engine_conf
                    and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST
                    and not task.f_run_on_this_party):
                continue
            count += 1
            try:
                process_exist = build_engine(
                    task.f_engine_conf.get("computing_engine")).is_alive(task)
                if not process_exist:
                    detect_logger(job_id=task.f_job_id).info(
                        'job {} task {} {} on {} {} process {} does not exist'
                        .format(task.f_job_id, task.f_task_id,
                                task.f_task_version, task.f_role,
                                task.f_party_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info('start to stop jobs: {}'.format(stop_job_ids))
            stop_jobs = set()
            for job_id in stop_job_ids:
                jobs = JobSaver.query_job(job_id=job_id)
                if jobs:
                    stop_jobs.add(jobs[0])
            # FAILED, not CANCELED: the executor process aborted.
            cls.request_stop_jobs(jobs=stop_jobs,
                                  stop_msg="task executor process abort",
                                  stop_status=JobStatus.FAILED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")
def start_task(cls, job_id, component_name, task_id, task_version, role, party_id, **kwargs):
    """
    Start task, update status and party status

    Authenticates the component against the job DSL, writes the run
    parameters to the task directory, launches the executor subprocess via
    the computing engine, then persists the task info and party status.

    :param job_id:
    :param component_name:
    :param task_id:
    :param task_version:
    :param role:
    :param party_id:
    :return:
    """
    job_dsl = job_utils.get_job_dsl(job_id, role, party_id)
    PrivilegeAuth.authentication_component(
        job_dsl,
        src_party_id=kwargs.get('src_party_id'),
        src_role=kwargs.get('src_role'),
        party_id=party_id,
        component_name=component_name)
    schedule_logger(job_id).info(
        f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess")
    started = False
    failed = False
    task_info = {
        "job_id": job_id,
        "task_id": task_id,
        "task_version": task_version,
        "role": role,
        "party_id": party_id,
    }
    try:
        task = JobSaver.query_task(task_id=task_id,
                                   task_version=task_version,
                                   role=role,
                                   party_id=party_id)[0]
        params_dict = job_utils.get_job_parameters(job_id, role, party_id)
        params_dict["src_user"] = kwargs.get("src_user")
        run_parameters = RunParameters(**params_dict)
        config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                  component_name, task_id,
                                                  task_version)
        os.makedirs(config_dir, exist_ok=True)
        run_parameters_path = os.path.join(config_dir, 'task_parameters.json')
        with open(run_parameters_path, 'w') as fw:
            fw.write(json_dumps(params_dict))
        schedule_logger(job_id).info(
            f"use computing engine {run_parameters.computing_engine}")
        task_info["engine_conf"] = {"computing_engine": run_parameters.computing_engine}
        backend_engine = build_engine(run_parameters.computing_engine)
        run_info = backend_engine.run(
            task=task,
            run_parameters=run_parameters,
            run_parameters_path=run_parameters_path,
            config_dir=config_dir,
            log_dir=job_utils.get_job_log_directory(job_id, role, party_id,
                                                    component_name),
            cwd_dir=job_utils.get_job_directory(job_id, role, party_id,
                                                component_name),
            user_name=kwargs.get("user_id"))
        task_info.update(run_info)
        task_info["start_time"] = current_timestamp()
        started = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
        failed = True
    finally:
        try:
            cls.update_task(task_info=task_info)
            # NOTE(review): party status is set to RUNNING first even on a
            # failed launch, then transitioned to FAILED — presumably the
            # status state machine requires going through RUNNING; confirm.
            task_info["party_status"] = TaskStatus.RUNNING
            cls.update_task_status(task_info=task_info)
            if failed:
                task_info["party_status"] = TaskStatus.FAILED
                cls.update_task_status(task_info=task_info)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        schedule_logger(job_id).info(
            "task {} {} on {} {} executor subprocess start {}".format(
                task_id, task_version, role, party_id,
                "success" if started else "failed"))