Example #1
0
 def detect_running_task(cls):
     """Find RUNNING tasks whose executor process is gone and stop their jobs.

     Each task stored with party status RUNNING is probed through the
     ``is_alive`` check of the engine built from its ``computing_engine``
     setting. When the process is missing, the task is read again after a
     3 second pause; its job is queued for stopping only if the party status
     is still RUNNING at that point. The queued jobs are then handed to
     ``cls.request_stop_jobs`` with status FAILED.

     All exceptions are logged rather than raised, and a summary line with
     the number of inspected tasks is always logged at the end.
     """
     detect_logger().info('start to detect running task..')
     count = 0
     try:
         doomed_job_ids = set()
         for task in JobSaver.query_task(party_status=TaskStatus.RUNNING,
                                         only_latest=False):
             # Skip only when the task has no engine conf, did not run on
             # this server's host and is not flagged as run on this party.
             if not (task.f_engine_conf
                     or task.f_run_ip == RuntimeConfig.JOB_SERVER_HOST
                     or task.f_run_on_this_party):
                 continue
             count += 1
             try:
                 engine = build_engine(
                     task.f_engine_conf.get("computing_engine"))
                 if engine.is_alive(task):
                     continue

                 msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                 detect_logger(job_id=task.f_job_id).info(
                     f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist"
                 )
                 # Pause, then read the task again before deciding to stop
                 # the job: the status may have changed in the meantime.
                 time.sleep(3)
                 rechecked = JobSaver.query_task(
                     task_id=task.f_task_id,
                     task_version=task.f_task_version,
                     role=task.f_role,
                     party_id=task.f_party_id)
                 if not rechecked:
                     detect_logger(task.f_job_id).warning(
                         f"{msg} can not found on db")
                     continue

                 latest_status = rechecked[0].f_party_status
                 if latest_status == TaskStatus.RUNNING:
                     doomed_job_ids.add(task.f_job_id)
                     detect_logger(task.f_job_id).info(
                         f"{msg} party status has been checked twice, try to stop job"
                     )
                 else:
                     detect_logger(task.f_job_id).info(
                         f"{msg} party status has changed to {latest_status}, may be stopped by task_controller.stop_task, pass stop job again"
                     )
             except Exception as task_error:
                 # A failure on one task must not abort the whole sweep.
                 detect_logger(job_id=task.f_job_id).exception(task_error)

         if doomed_job_ids:
             detect_logger().info(
                 'start to stop jobs: {}'.format(doomed_job_ids))

         # Resolve the collected ids to job records; ids without a record
         # are dropped.
         jobs_to_stop = set()
         for job_id in doomed_job_ids:
             matches = JobSaver.query_job(job_id=job_id)
             if matches:
                 jobs_to_stop.add(matches[0])
         cls.request_stop_jobs(jobs=jobs_to_stop,
                               stop_msg="task executor process abort",
                               stop_status=JobStatus.FAILED)
     except Exception as sweep_error:
         detect_logger().exception(sweep_error)
     finally:
         detect_logger().info(f"finish detect {count} running task")
Example #2
0
 def kill_task(cls, task: Task):
     """Ask the task's computing engine to kill the task and log the outcome.

     The engine is built from the ``computing_engine`` entry of
     ``task.f_engine_conf``; ``kill`` is only called when ``build_engine``
     returns a truthy engine.

     :param task: task record whose executor should be killed
     :return: True when the attempt completed without raising an
         ``Exception`` (this includes the case where no engine was built and
         nothing was killed), False when an ``Exception`` was caught and
         logged
     """
     kill_status = False
     try:
         backend_engine = build_engine(
             task.f_engine_conf.get("computing_engine"))
         if backend_engine:
             backend_engine.kill(task)
     except Exception as e:
         schedule_logger(task.f_job_id).exception(e)
     else:
         kill_status = True
     finally:
         # Always record the outcome, even when an exception is propagating.
         schedule_logger(task.f_job_id).info(
             'job {} task {} {} on {} {} process {} kill {}'.format(
                 task.f_job_id, task.f_task_id, task.f_task_version,
                 task.f_role, task.f_party_id, task.f_run_pid,
                 'success' if kill_status else 'failed'))
     # Return after the try statement, not inside ``finally``: a return in
     # ``finally`` would silently discard exceptions the ``except Exception``
     # clause does not handle, such as KeyboardInterrupt or SystemExit.
     return kill_status
Example #3
0
 def detect_running_task(cls):
     """Stop the jobs of RUNNING tasks whose executor process no longer exists.

     Each task stored with party status RUNNING is probed through the
     ``is_alive`` check of the engine built from its ``computing_engine``
     setting. The job of every task whose process is missing is handed to
     ``cls.request_stop_jobs`` with status CANCELED.

     All exceptions are logged rather than raised, and a summary line with
     the number of inspected tasks is always logged at the end.
     """
     detect_logger().info('start to detect running task..')
     count = 0
     try:
         dead_job_ids = set()
         for task in JobSaver.query_task(party_status=TaskStatus.RUNNING,
                                         only_latest=False):
             # Skip only when the task has no engine conf, did not run on
             # this server's host and is not flagged as run on this party.
             if not (task.f_engine_conf
                     or task.f_run_ip == RuntimeConfig.JOB_SERVER_HOST
                     or task.f_run_on_this_party):
                 continue
             count += 1
             try:
                 engine = build_engine(
                     task.f_engine_conf.get("computing_engine"))
                 if engine.is_alive(task):
                     continue
                 detect_logger(job_id=task.f_job_id).info(
                     'job {} task {} {} on {} {} process {} does not exist'
                     .format(task.f_job_id, task.f_task_id,
                             task.f_task_version, task.f_role,
                             task.f_party_id, task.f_run_pid))
                 dead_job_ids.add(task.f_job_id)
             except Exception as task_error:
                 # A failure on one task must not abort the whole sweep.
                 detect_logger(job_id=task.f_job_id).exception(task_error)

         if dead_job_ids:
             detect_logger().info(
                 'start to stop jobs: {}'.format(dead_job_ids))

         # Resolve the collected ids to job records; ids without a record
         # are dropped.
         jobs_to_stop = set()
         for job_id in dead_job_ids:
             matches = JobSaver.query_job(job_id=job_id)
             if matches:
                 jobs_to_stop.add(matches[0])
         cls.request_stop_jobs(jobs=jobs_to_stop,
                               stop_msg="task executor process abort",
                               stop_status=JobStatus.CANCELED)
     except Exception as sweep_error:
         detect_logger().exception(sweep_error)
     finally:
         detect_logger().info(f"finish detect {count} running task")
Example #4
0
    def start_task(cls, job_id, component_name, task_id, task_version, role,
                   party_id, **kwargs):
        """
        Launch the executor for one task and record the result on the task.

        The component is first checked with
        ``PrivilegeAuth.authentication_component``; that call is outside the
        error handling, so anything it raises reaches the caller. After that,
        the job parameters are written to ``task_parameters.json`` in the task
        directory, the computing engine is built and asked to run the task,
        and the task record is updated. Exceptions raised during launch or
        during the record update are logged, not raised.

        :param job_id: id of the job the task belongs to
        :param component_name: name of the component the task runs
        :param task_id: id of the task
        :param task_version: version of the task
        :param role: role of the local party
        :param party_id: id of the local party
        :param kwargs: optional ``src_party_id``, ``src_role``, ``src_user``
            and ``user_id`` values
        :return: None
        """
        dsl = job_utils.get_job_dsl(job_id, role, party_id)
        PrivilegeAuth.authentication_component(
            dsl,
            src_party_id=kwargs.get('src_party_id'),
            src_role=kwargs.get('src_role'),
            party_id=party_id,
            component_name=component_name)

        schedule_logger(job_id).info(
            f"try to start task {task_id} {task_version} on {role} {party_id} executor subprocess"
        )
        started = False
        launch_failed = False
        # Fields identifying the task; launch details are merged in below.
        info = dict(job_id=job_id,
                    task_id=task_id,
                    task_version=task_version,
                    role=role,
                    party_id=party_id)
        try:
            matching_tasks = JobSaver.query_task(task_id=task_id,
                                                 task_version=task_version,
                                                 role=role,
                                                 party_id=party_id)
            task = matching_tasks[0]

            parameters = job_utils.get_job_parameters(job_id, role, party_id)
            parameters["src_user"] = kwargs.get("src_user")
            run_parameters = RunParameters(**parameters)

            config_dir = job_utils.get_task_directory(job_id, role, party_id,
                                                      component_name, task_id,
                                                      task_version)
            os.makedirs(config_dir, exist_ok=True)

            # Write the parameters to a file whose path is passed to the
            # engine.
            run_parameters_path = os.path.join(config_dir,
                                               'task_parameters.json')
            with open(run_parameters_path, 'w') as parameters_file:
                parameters_file.write(json_dumps(parameters))

            engine_name = run_parameters.computing_engine
            schedule_logger(job_id).info(
                f"use computing engine {engine_name}")
            info["engine_conf"] = {"computing_engine": engine_name}

            engine = build_engine(engine_name)
            launch_details = engine.run(
                task=task,
                run_parameters=run_parameters,
                run_parameters_path=run_parameters_path,
                config_dir=config_dir,
                log_dir=job_utils.get_job_log_directory(
                    job_id, role, party_id, component_name),
                cwd_dir=job_utils.get_job_directory(job_id, role, party_id,
                                                    component_name),
                user_name=kwargs.get("user_id"))
            info.update(launch_details)
            info["start_time"] = current_timestamp()
            started = True
        except Exception as launch_error:
            schedule_logger(job_id).exception(launch_error)
            launch_failed = True
        finally:
            try:
                cls.update_task(task_info=info)
                # The party status is always set to RUNNING first; a failed
                # launch is then additionally recorded as FAILED.
                if launch_failed:
                    statuses = (TaskStatus.RUNNING, TaskStatus.FAILED)
                else:
                    statuses = (TaskStatus.RUNNING,)
                for party_status in statuses:
                    info["party_status"] = party_status
                    cls.update_task_status(task_info=info)
            except Exception as update_error:
                schedule_logger(job_id).exception(update_error)

            outcome = "success" if started else "failed"
            schedule_logger(job_id).info(
                "task {} {} on {} {} executor subprocess start {}".format(
                    task_id, task_version, role, party_id, outcome))