Ejemplo n.º 1
0
 def start_inheriting_job(cls, job):
     """Mark *job* as inheriting and launch the inheritance worker subprocess.

     :param job: job model object; its id/role/party_id identify the record.
     :return: None; the worker runs asynchronously and reports via the DB.
     """
     # Flip the job's inheritance status to RUNNING before spawning the worker.
     JobSaver.update_job(
         job_info={
             "job_id": job.f_job_id,
             "role": job.f_role,
             "party_id": job.f_party_id,
             "inheritance_status": JobInheritanceStatus.RUNNING
         })
     conf_dir = job_utils.get_job_directory(job_id=job.f_job_id)
     os.makedirs(conf_dir, exist_ok=True)
     # Run the module that defines JobInherit as a script, passing the job keys.
     process_cmd = [
         sys.executable or 'python3',
         sys.modules[JobInherit.__module__].__file__,
         '--job_id',
         job.f_job_id,
         '--role',
         job.f_role,
         '--party_id',
         job.f_party_id,
     ]
     log_dir = os.path.join(
         job_utils.get_job_log_directory(job_id=job.f_job_id),
         "job_inheritance")
     # The returned process handle was previously bound to an unused local;
     # it is not needed here, so the result is deliberately discarded.
     process_utils.run_subprocess(job_id=job.f_job_id,
                                  config_dir=conf_dir,
                                  process_cmd=process_cmd,
                                  log_dir=log_dir,
                                  process_name="job_inheritance")
Ejemplo n.º 2
0
def pipeline_dag_dependency(job_info):
    """Return the DAG dependency description for a job.

    Uses the persisted job record when ``job_id`` is present in *job_info*,
    otherwise parses the dsl/conf carried directly in the request.
    """
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        component_need_run = {}
        if job_info.get('job_id'):
            matched = JobSaver.query_job(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"])
            if not matched:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            saved_job = matched[0]
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=saved_job.f_dsl,
                runtime_conf=saved_job.f_runtime_conf_on_party,
                train_runtime_conf=saved_job.f_train_runtime_conf)
            tasks = JobSaver.query_task(job_id=job_info["job_id"], party_id=job_info["party_id"], role=job_info["role"], only_latest=True)
            # Map each component to its need_run flag (defaults to True).
            component_need_run = {
                task.f_component_name: task.f_component_parameters.get("ComponentParam", {}).get("need_run", True)
                for task in tasks
            }
        else:
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        dependency = dsl_parser.get_dependency()
        dependency["component_need_run"] = component_need_run
        return dependency
    except Exception as e:
        stat_logger.exception(e)
        raise e
Ejemplo n.º 3
0
    def update_parameter(cls, job_id, role, party_id,
                         updated_parameters: dict):
        """Merge updated job/component parameters into the stored job
        configuration and persist the result via JobSaver."""
        job_configuration = job_utils.get_job_configuration(job_id=job_id,
                                                            role=role,
                                                            party_id=party_id)
        new_job_parameters = updated_parameters.get("job_parameters")
        new_component_parameters = updated_parameters.get("component_parameters")
        if new_job_parameters:
            job_configuration.runtime_conf["job_parameters"] = new_job_parameters
            run_parameters = RunParameters(**new_job_parameters["common"])
            cls.create_job_parameters_on_party(role=role,
                                               party_id=party_id,
                                               job_parameters=run_parameters)
            job_configuration.runtime_conf_on_party["job_parameters"] = run_parameters.to_dict()
        if new_component_parameters:
            job_configuration.runtime_conf["component_parameters"] = new_component_parameters
            job_configuration.runtime_conf_on_party["component_parameters"] = new_component_parameters

        # Persist both conf variants back to the job record.
        JobSaver.update_job({
            "job_id": job_id,
            "role": role,
            "party_id": party_id,
            "runtime_conf": job_configuration.runtime_conf,
            "runtime_conf_on_party": job_configuration.runtime_conf_on_party,
        })
Ejemplo n.º 4
0
 def detect_running_task(cls):
     """Scan tasks with party_status RUNNING, verify their executor
     processes still exist, and request a stop for jobs whose task
     process has vanished."""
     detect_logger().info('start to detect running task..')
     count = 0
     try:
         running_tasks = JobSaver.query_task(
             party_status=TaskStatus.RUNNING, only_latest=False)
         stop_job_ids = set()
         for task in running_tasks:
             # Skip tasks this server is not responsible for.
             # NOTE(review): the three conditions are combined with `and`,
             # so a task is only skipped when all three hold — confirm
             # this matches the intended responsibility check.
             if not task.f_engine_conf and task.f_run_ip != RuntimeConfig.JOB_SERVER_HOST and not task.f_run_on_this_party:
                 continue
             count += 1
             try:
                 # Ask the task's computing engine whether the executor
                 # process is still alive.
                 process_exist = build_engine(
                     task.f_engine_conf.get("computing_engine")).is_alive(
                         task)
                 if not process_exist:
                     msg = f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id}"
                     detect_logger(job_id=task.f_job_id).info(
                         f"{msg} with {task.f_party_status} process {task.f_run_pid} does not exist"
                     )
                     # Re-check after a short delay so a normal stop that is
                     # already in flight has time to update the status.
                     time.sleep(3)
                     _tasks = JobSaver.query_task(
                         task_id=task.f_task_id,
                         task_version=task.f_task_version,
                         role=task.f_role,
                         party_id=task.f_party_id)
                     if _tasks:
                         # Only stop the job if the task is still RUNNING
                         # after the second check (process abort confirmed).
                         if _tasks[0].f_party_status == TaskStatus.RUNNING:
                             stop_job_ids.add(task.f_job_id)
                             detect_logger(task.f_job_id).info(
                                 f"{msg} party status has been checked twice, try to stop job"
                             )
                         else:
                             detect_logger(task.f_job_id).info(
                                 f"{msg} party status has changed to {_tasks[0].f_party_status}, may be stopped by task_controller.stop_task, pass stop job again"
                             )
                     else:
                         detect_logger(task.f_job_id).warning(
                             f"{msg} can not found on db")
             except Exception as e:
                 # Per-task failures must not abort the whole detection pass.
                 detect_logger(job_id=task.f_job_id).exception(e)
         if stop_job_ids:
             detect_logger().info(
                 'start to stop jobs: {}'.format(stop_job_ids))
         stop_jobs = set()
         for job_id in stop_job_ids:
             jobs = JobSaver.query_job(job_id=job_id)
             if jobs:
                 stop_jobs.add(jobs[0])
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="task executor process abort",
                               stop_status=JobStatus.FAILED)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info(f"finish detect {count} running task")
Ejemplo n.º 5
0
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     """Pull the latest status of *initiator_task* from every party and
     sync the local task records.

     :param job: job the task belongs to.
     :param initiator_task: the initiator-side task record.
     :param set_status: status to force on parties that returned a
         federated error; skipped when None.
     """
     tasks_on_all_party = JobSaver.query_task(
         task_id=initiator_task.f_task_id,
         task_version=initiator_task.f_task_version)
     tasks_status_on_all = {task.f_status for task in tasks_on_all_party}
     # Nothing to collect when all parties already agree on a single
     # status and none of them is still RUNNING.
     if len(tasks_status_on_all) <= 1 and TaskStatus.RUNNING not in tasks_status_on_all:
         return
     status, federated_response = FederatedScheduler.collect_task(
         job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(
             f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed"
         )
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             if party_response["retcode"] == RetCode.SUCCESS:
                 JobSaver.update_task_status(
                     task_info=party_response["data"])
                 JobSaver.update_task(task_info=party_response["data"])
             elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                 tmp_task_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                     "party_status": TaskStatus.RUNNING
                 }
                 # RUNNING is written first, then set_status — presumably to
                 # satisfy a status-transition check in update_task_status;
                 # confirm against JobSaver's transition rules.
                 JobSaver.update_task_status(task_info=tmp_task_info)
                 tmp_task_info["party_status"] = set_status
                 JobSaver.update_task_status(task_info=tmp_task_info)
Ejemplo n.º 6
0
 def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
     """Propagate the given fields of the initiator's job record to every
     role/party copy of the job stored locally."""
     all_jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
     if not all_jobs:
         raise Exception("Failed to update job status on initiator")
     job_info = initiator_job.to_human_model_dict(
         only_primary_with=update_fields)
     # Copy the requested f_* attributes from the initiator record.
     job_info.update({field: getattr(initiator_job, "f_%s" % field)
                      for field in update_fields})
     for party_job in all_jobs:
         job_info["role"] = party_job.f_role
         job_info["party_id"] = party_job.f_party_id
         JobSaver.update_job_status(job_info=job_info)
         JobSaver.update_job(job_info=job_info)
Ejemplo n.º 7
0
def report_task(job_id, component_name, task_id, task_version, role, party_id):
    """Persist a task status report posted by a party; the URL path
    parameters override any matching keys in the request body."""
    task_info = dict(request.json)
    task_info["job_id"] = job_id
    task_info["task_id"] = task_id
    task_info["task_version"] = task_version
    task_info["role"] = role
    task_info["party_id"] = party_id
    JobSaver.update_task(task_info=task_info)
    # Only touch the status table when the report carries a party_status.
    if task_info.get("party_status"):
        JobSaver.update_status(Task, task_info)
    return get_json_result(retcode=0, retmsg='success')
Ejemplo n.º 8
0
def upload_history():
    """List successful upload_0 tasks, newest first, optionally limited by
    the request's ``limit`` field."""
    request_data = request.json
    query_kwargs = dict(component_name='upload_0',
                        status=StatusSet.SUCCESS,
                        run_on_this_party=True)
    if request_data.get('job_id'):
        query_kwargs['job_id'] = request_data.get('job_id')
    tasks = JobSaver.query_task(**query_kwargs)
    limit = request_data.get('limit')
    # Reverse to newest-first; when a limit is given, keep only that many.
    recent = tasks[-1::-1] if not limit else tasks[-1:-limit - 1:-1]
    jobs_run_conf = job_utils.get_job_configuration(None, None, None, recent)
    data = get_upload_info(jobs_run_conf=jobs_run_conf)
    return get_json_result(retcode=0, retmsg='success', data=data)
Ejemplo n.º 9
0
 def create_task(cls, role, party_id, run_on_this_party, task_info):
     """Fill in the bookkeeping fields of *task_info* in place and insert
     the task; generates task_id/task_version when absent."""
     task_info.update({
         "role": role,
         "party_id": party_id,
         "status": TaskStatus.WAITING,
         "party_status": TaskStatus.WAITING,
         "create_time": base_utils.current_timestamp(),
         "run_on_this_party": run_on_this_party,
     })
     if "task_id" not in task_info:
         task_info["task_id"] = job_utils.generate_task_id(
             job_id=task_info["job_id"],
             component_name=task_info["component_name"])
     # First version of a brand-new task is 0.
     task_info.setdefault("task_version", 0)
     JobSaver.create_task(task_info=task_info)
Ejemplo n.º 10
0
 def stop_jobs(cls, job_id, stop_status, role=None, party_id=None):
     """Stop every local job record matching *job_id* (optionally narrowed
     to one role/party) and aggregate the kill results.

     :param job_id: id of the job(s) to stop.
     :param stop_status: terminal status to apply.
     :param role: optional role filter; only used together with party_id.
     :param party_id: optional party filter.
     :return: tuple (overall_success: bool, details: dict).
     """
     if role and party_id:
         jobs = JobSaver.query_job(job_id=job_id,
                                   role=role,
                                   party_id=party_id)
     else:
         jobs = JobSaver.query_job(job_id=job_id)
     kill_status = True
     kill_details = {}
     for job in jobs:
         kill_job_status, kill_job_details = cls.stop_job(
             job=job, stop_status=stop_status)
         # Overall success only when every matched job stopped cleanly.
         kill_status = kill_status & kill_job_status
         # NOTE(review): keyed by job_id, so when several role/party copies
         # match, later iterations overwrite earlier details — confirm
         # whether keying by (role, party_id) was intended.
         kill_details[job_id] = kill_job_details
     return kill_status, kill_details
Ejemplo n.º 11
0
    def status_reload(cls, job, source_tasks, target_tasks):
        """Copy each source task's status onto its target counterpart, then
        mark the job's inheritance as successful."""
        schedule_logger(job.f_job_id).info("start reload status")
        # Reload every task status onto the matching target task.
        for task_key in source_tasks:
            JobSaver.reload_task(source_tasks[task_key], target_tasks[task_key])

        # Record that the inheritance completed successfully.
        reload_info = {
            "job_id": job.f_job_id,
            "role": job.f_role,
            "party_id": job.f_party_id,
            "inheritance_status": JobInheritanceStatus.SUCCESS
        }
        JobSaver.update_job(job_info=reload_info)
        schedule_logger(job.f_job_id).info("reload status success")
Ejemplo n.º 12
0
 def start_job(cls, job_id, initiator_role, initiator_party_id):
     """Trigger the federated start of a job from its initiator party.

     :param job_id: job to start.
     :param initiator_role: role of the initiator party.
     :param initiator_party_id: party id of the initiator.
     """
     schedule_logger(job_id=job_id).info(
         "try to start job {} on initiator {} {}".format(
             job_id, initiator_role, initiator_party_id))
     # The previous version also built a job_info dict (status/tag/start
     # time) here but never used it; that dead code has been removed.
     jobs = JobSaver.query_job(job_id=job_id,
                               role=initiator_role,
                               party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
         # Broadcast the start command to all parties of the job.
         FederatedScheduler.start_job(job=job)
         schedule_logger(job_id=job_id).info(
             "start job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
     else:
         schedule_logger(job_id=job_id).error(
             "can not found job {} on initiator {} {}".format(
                 job_id, initiator_role, initiator_party_id))
Ejemplo n.º 13
0
 def _run(self):
     """Run the job reload for the job named by the CLI arguments,
     marking the job's inheritance FAILED on any error."""
     job = JobSaver.query_job(job_id=self.args.job_id,
                              role=self.args.role,
                              party_id=self.args.party_id)[0]
     try:
         JobController.job_reload(job)
     except Exception as e:
         traceback.print_exc()
         # Persist the failure so the scheduler sees inheritance FAILED.
         failure_info = {
             "job_id": job.f_job_id,
             "role": job.f_role,
             "party_id": job.f_party_id,
             "inheritance_status": JobInheritanceStatus.FAILED
         }
         JobSaver.update_job(job_info=failure_info)
         LOGGER.exception(e)
Ejemplo n.º 14
0
 def start_task(cls, job, task):
     """Try to start one task: reserve resources, atomically flip the task
     to RUNNING, then ask all parties to start it.

     :param job: the job the task belongs to.
     :param task: the task record to start.
     :return: a SchedulingStatusCode value.
     """
     schedule_logger(task.f_job_id).info(
         "try to start task {} {} on {} {}".format(task.f_task_id,
                                                   task.f_task_version,
                                                   task.f_role,
                                                   task.f_party_id))
     # Reserve computing resources first; bail out when none are available.
     apply_status = ResourceManager.apply_for_task_resource(
         task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not apply_status:
         return SchedulingStatusCode.NO_RESOURCE
     task.f_status = TaskStatus.RUNNING
     # The status update doubles as a claim on the task: it fails when
     # another scheduler has already moved the task out of its old status.
     update_status = JobSaver.update_task_status(
         task_info=task.to_human_model_dict(only_primary_with=["status"]))
     if not update_status:
         # Another scheduler scheduling the task
         schedule_logger(task.f_job_id).info(
             "task {} {} start on another scheduler".format(
                 task.f_task_id, task.f_task_version))
         # Rollback
         task.f_status = TaskStatus.WAITING
         ResourceManager.return_task_resource(
             task_info=task.to_human_model_dict(
                 only_primary_with=["status"]))
         return SchedulingStatusCode.PASS
     schedule_logger(task.f_job_id).info("start task {} {} on {} {}".format(
         task.f_task_id, task.f_task_version, task.f_role, task.f_party_id))
     # Tell all parties the new status, then issue the federated start.
     FederatedScheduler.sync_task_status(job=job, task=task)
     status_code, response = FederatedScheduler.start_task(job=job,
                                                           task=task)
     if status_code == FederatedSchedulingStatusCode.SUCCESS:
         return SchedulingStatusCode.SUCCESS
     else:
         return SchedulingStatusCode.FAILED
Ejemplo n.º 15
0
def component_output_data_download():
    """Package the output data of one component task into a tar.gz and
    stream it back to the client."""
    req = request.json
    tasks = JobSaver.query_task(only_latest=True,
                                job_id=req['job_id'],
                                component_name=req['component_name'],
                                role=req['role'],
                                party_id=req['party_id'])
    if not tasks:
        raise ValueError(
            f'no found task, please check if the parameters are correct:{req}'
        )
    # Load the provider-specific dependencies of the task's component.
    import_component_output_depend(tasks[0].f_provider_info)
    try:
        output_tables_meta = get_component_output_tables_meta(task_data=req)
    except Exception as e:
        stat_logger.exception(e)
        return error_response(210, str(e))
    if not output_tables_meta:
        return error_response(response_code=210, retmsg='no data')
    limit = req.get('limit', -1)
    if limit == 0:
        return error_response(response_code=210, retmsg='limit is 0')
    tar_file_name = 'job_{}_{}_{}_{}_output_data.tar.gz'.format(
        req['job_id'], req['component_name'],
        req['role'], req['party_id'])
    return TableStorage.send_table(output_tables_meta,
                                   tar_file_name,
                                   limit=limit,
                                   need_head=req.get("head", True))
Ejemplo n.º 16
0
 def update_job_status(cls, job_info):
     """Update the job's status record, releasing its resources when the
     new status is terminal; returns whether the update was applied."""
     updated = JobSaver.update_job_status(job_info=job_info)
     # Only return resources once the job has actually reached an end state.
     if updated and EndStatus.contains(job_info.get("status")):
         ResourceManager.return_job_resource(job_id=job_info["job_id"],
                                             role=job_info["role"],
                                             party_id=job_info["party_id"])
     return updated
Ejemplo n.º 17
0
def stop_job():
    """Stop a job on this party, then fan the stop request out to all
    parties via the federated scheduler."""
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, kill_details = JobController.stop_jobs(
        job_id=job_id, stop_status=stop_status)
    schedule_logger(job_id).info(
        f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(
        f"request stop job {jobs[0]} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=jobs[0],
        stop_status=stop_status,
        command_body=jobs[0].to_json())
    if status_code == FederatedSchedulingStatusCode.SUCCESS:
        return get_json_result(
            retcode=RetCode.SUCCESS,
            retmsg=f"stop job on this party {kill_status};\n"
            f"stop job on all party success")
    return get_json_result(
        retcode=RetCode.OPERATING_ERROR,
        retmsg="stop job on this party {};\n"
        "stop job failed:\n{}".format(
            kill_status,
            json_dumps(response, indent=4)))
Ejemplo n.º 18
0
 def update_job(cls, job_info):
     """
     Save to local database
     :param job_info: dict of job fields to persist; must carry the keys
         identifying the job record (job_id, role, party_id).
     :return: the result of JobSaver.update_job (update success flag).
     """
     return JobSaver.update_job(job_info=job_info)
Ejemplo n.º 19
0
 def clean_task(cls, job_id, task_id, task_version, role, party_id,
                content_type: TaskCleanResourceType):
     """Clean a task's metrics or its storage tables.

     :param content_type: which resource kind to clean (METRICS or TABLE).
     :return: True only when at least one clean was attempted and every
         attempted clean succeeded.
     """
     status = set()
     if content_type == TaskCleanResourceType.METRICS:
         tracker = Tracker(job_id=job_id,
                           role=role,
                           party_id=party_id,
                           task_id=task_id,
                           task_version=task_version)
         status.add(tracker.clean_metrics())
     elif content_type == TaskCleanResourceType.TABLE:
         jobs = JobSaver.query_job(job_id=job_id,
                                   role=role,
                                   party_id=party_id)
         if jobs:
             job = jobs[0]
             # Table cleanup needs the job's runtime parameters.
             job_parameters = RunParameters(
                 **job.f_runtime_conf_on_party["job_parameters"])
             tracker = Tracker(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               task_id=task_id,
                               task_version=task_version,
                               job_parameters=job_parameters)
             status.add(tracker.clean_task(job.f_runtime_conf_on_party))
     # Success iff exactly one distinct result was recorded and it is True
     # (an empty set — nothing attempted — yields False, as before).
     return status == {True}
Ejemplo n.º 20
0
def query_task():
    """Query tasks using arbitrary filters taken from the request body."""
    tasks = JobSaver.query_task(**request.json)
    if not tasks:
        return get_json_result(retcode=101, retmsg='find task failed')
    payload = [task.to_json() for task in tasks]
    return get_json_result(retcode=0, retmsg='success', data=payload)
Ejemplo n.º 21
0
def update_job():
    """Update a job's description ('notes' field) identified by
    job_id/role/party_id in the request body."""
    job_info = request.json
    jobs = JobSaver.query_job(job_id=job_info['job_id'],
                              party_id=job_info['party_id'],
                              role=job_info['role'])
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    update_info = {
        'description': job_info.get('notes', ''),
        'job_id': job_info['job_id'],
        'role': job_info['role'],
        'party_id': job_info['party_id']
    }
    JobSaver.update_job(job_info=update_info)
    return get_json_result(retcode=0, retmsg='success')
Ejemplo n.º 22
0
 def report_task_to_initiator(cls, task_info):
     """Push the task's latest record to the initiator when the task uses
     PUSH-style federated status collection.

     :param task_info: dict with task_id, task_version, role, party_id.
     """
     tasks = JobSaver.query_task(task_id=task_info["task_id"],
                                 task_version=task_info["task_version"],
                                 role=task_info["role"],
                                 party_id=task_info["party_id"])
     # Guard: an unmatched query previously raised IndexError on tasks[0].
     if not tasks:
         return
     if tasks[0].f_federated_status_collect_type == FederatedCommunicationType.PUSH:
         FederatedScheduler.report_task_to_initiator(task=tasks[0])
Ejemplo n.º 23
0
def get_job_table_list():
    """Return the table list of the first job matching the query filters."""
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='no find job')
    tables = get_job_all_table(jobs[0])
    return get_json_result(data=tables)
Ejemplo n.º 24
0
def clean_queue():
    """Cancel every WAITING job initiated locally and report the per-job
    federated status codes."""
    waiting_jobs = JobSaver.query_job(is_initiator=True, status=JobStatus.WAITING)
    # Keep only the status code from each federated stop response.
    clean_status = {
        job.f_job_id: FederatedScheduler.request_stop_job(
            job=job, stop_status=JobStatus.CANCELED)[0]
        for job in waiting_jobs
    }
    return get_json_result(retcode=0, retmsg='success', data=clean_status)
Ejemplo n.º 25
0
def query_job():
    """Query jobs by arbitrary filters; an empty result is not an error."""
    jobs = JobSaver.query_job(**request.json)
    if jobs:
        return get_json_result(retcode=0,
                               retmsg='success',
                               data=[job.to_json() for job in jobs])
    return get_json_result(retcode=0,
                           retmsg='no job could be found',
                           data=[])
Ejemplo n.º 26
0
def get_job_table_list():
    """Return the table list for a job; job_id/role/party_id are required
    in the request body."""
    detect_utils.check_config(config=request.json, required_arguments=['job_id', 'role', 'party_id'])
    jobs = JobSaver.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='no find job')
    return get_json_result(data=get_job_all_table(jobs[0]))
Ejemplo n.º 27
0
def component_output_data_table():
    """Forward an 'output/table' tracker command for a component to the
    job's parties; required fields are validated first."""
    request_data = request.json
    detect_utils.check_config(config=request_data, required_arguments=['job_id', 'role', 'party_id', 'component_name'])
    jobs = JobSaver.query_job(job_id=request_data.get('job_id'))
    if not jobs:
        return get_json_result(retcode=100, retmsg='No found job')
    result = FederatedScheduler.tracker_command(jobs[0], request_data, 'output/table')
    return jsonify(result)
Ejemplo n.º 28
0
def component_output_data_table():
    """Proxy the 'output/table' tracker command for the requested job."""
    req = request.json
    matched = JobSaver.query_job(job_id=req.get('job_id'))
    if not matched:
        return get_json_result(retcode=100, retmsg='No found job')
    return jsonify(
        FederatedScheduler.tracker_command(matched[0], req, 'output/table'))
Ejemplo n.º 29
0
 def load_tasks(cls, component_list, job_id, role, party_id):
     """Map component names to their latest task records.

     :param component_list: component names to look up.
     :return: dict of component_name -> task; components with no matching
         task are omitted.
     """
     tasks = JobSaver.query_task(job_id=job_id,
                                 role=role,
                                 party_id=party_id,
                                 only_latest=True)
     # Index tasks by component once instead of scanning the task list for
     # every component (the old nested loop was O(n*m)); like the original,
     # the last task with a given component name wins.
     tasks_by_component = {task.f_component_name: task for task in tasks}
     return {cpn: tasks_by_component[cpn]
             for cpn in component_list
             if cpn in tasks_by_component}
Ejemplo n.º 30
0
def check_dependence(job_id, role, party_id):
    """Check whether the job's dependencies are ready; a RUNNING retcode
    means they are still being installed."""
    job = JobSaver.query_job(job_id=job_id, role=role, party_id=party_id)[0]
    if DependenceManager.check_job_dependence(job):
        return get_json_result(retcode=0, retmsg='success')
    return get_json_result(
        retcode=RetCode.RUNNING,
        retmsg=f"check for job {job_id} dependence failed, "
        f"dependencies are being installed automatically, it may take a few minutes"
    )