Example #1
def update_job_progress(job_id, dag, current_task_id):
    component_count = len(dag.get_dependency()['component_list'])
    success_count = success_task_count(job_id=job_id)
    job = Job()
    job.f_progress = float(success_count) / component_count * 100
    job.f_update_time = current_timestamp()
    job.f_current_tasks = json_dumps([current_task_id])
    return job
Example #2
    def schedule_running_job(cls, job: Job, force_sync_status=False):
        schedule_logger(job.f_job_id).info("scheduling running job")

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(
            job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
        tasks_status = dict([(task.f_component_name, task.f_status)
                             for task in tasks])
        new_job_status = cls.calculate_job_status(
            task_scheduling_status_code=task_scheduling_status_code,
            tasks_status=tasks_status.values())
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            new_job_status = JobStatus.CANCELED
        total, finished_count = cls.calculate_job_progress(
            tasks_status=tasks_status)
        new_progress = float(finished_count) / total * 100
        schedule_logger(job.f_job_id).info(
            f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
        )
        if new_job_status != job.f_status or new_progress != job.f_progress:
            # Make sure to update them separately: progress and status are each guarded by their own condition below
            if int(new_progress) - job.f_progress > 0:
                job.f_progress = new_progress
                FederatedScheduler.sync_job(job=job,
                                            update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
        if auto_rerun_tasks:
            schedule_logger(job.f_job_id).info("job has auto-rerun tasks")
            cls.set_job_rerun(job_id=job.f_job_id,
                              initiator_role=job.f_initiator_role,
                              initiator_party_id=job.f_initiator_party_id,
                              tasks=auto_rerun_tasks,
                              auto=True)
        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job.f_job_id).info("finish scheduling running job")
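A note on the two guards above: progress only moves forward (the `int(new_progress) - job.f_progress > 0` check), so a stale or out-of-order sync can never roll the displayed percentage back. A minimal standalone sketch of that forward-only rule, with a hypothetical helper in place of FATE's Job model:

# Hypothetical helper illustrating the forward-only progress guard.
def next_progress(current: float, finished_count: int, total: int) -> float:
    new_progress = float(finished_count) / total * 100
    # Same guard as in schedule_running_job: never regress
    return new_progress if int(new_progress) - current > 0 else current

assert next_progress(50.0, 3, 4) == 75.0  # moves forward
assert next_progress(75.0, 2, 4) == 75.0  # a stale count cannot roll it back
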
Example #3
    def resource_for_task(cls, task_info, operation_type):
        cores_per_task, memory_per_task = cls.calculate_task_resource(
            task_info=task_info)

        if cores_per_task or memory_per_task:
            filters, updates = cls.update_resource_sql(
                resource_model=Job,
                cores=cores_per_task,
                memory=memory_per_task,
                operation_type=operation_type,
            )
            filters.append(Job.f_job_id == task_info["job_id"])
            filters.append(Job.f_role == task_info["role"])
            filters.append(Job.f_party_id == task_info["party_id"])
            filters.append(Job.f_resource_in_use == True)
            operate = Job.update(updates).where(*filters)
            operate_status = operate.execute() > 0
        else:
            operate_status = True
        if operate_status:
            schedule_logger(job_id=task_info["job_id"]).info(
                "task {} {} {} resource successfully".format(
                    task_info["task_id"], task_info["task_version"],
                    operation_type))
        else:
            schedule_logger(job_id=task_info["job_id"]).warning(
                "task {} {} {} resource failed".format(
                    task_info["task_id"], task_info["task_version"],
                    operation_type))
        return operate_status
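The success check here is simply `execute() > 0`: the UPDATE carries its own guard in the WHERE clause (built by `update_resource_sql`), so the database applies the whole check-and-modify step atomically. A self-contained sketch of that guarded-update idea using peewee (which these snippets appear to use), with a hypothetical Engine table standing in for FATE's models:

# Self-contained sketch of a guarded resource UPDATE (hypothetical Engine model).
from peewee import SqliteDatabase, Model, CharField, IntegerField

db = SqliteDatabase(":memory:")

class Engine(Model):
    name = CharField(primary_key=True)
    remaining_cores = IntegerField()

    class Meta:
        database = db

db.create_tables([Engine])
Engine.create(name="eggroll", remaining_cores=8)

def apply_cores(engine_name: str, cores: int) -> bool:
    # The WHERE guard makes check-and-decrement one atomic statement;
    # execute() returns the number of rows that matched.
    operate = Engine.update(
        {Engine.remaining_cores: Engine.remaining_cores - cores}
    ).where(Engine.name == engine_name, Engine.remaining_cores >= cores)
    return operate.execute() > 0

assert apply_cores("eggroll", 6) is True   # 8 -> 2
assert apply_cores("eggroll", 6) is False  # only 2 left: no row matches
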
Example #4
def get_job_parameters(job_id, role, party_id):
    jobs = Job.select(Job.f_runtime_conf_on_party).where(
        Job.f_job_id == job_id, Job.f_role == role, Job.f_party_id == party_id)
    if jobs:
        job = jobs[0]
        return job.f_runtime_conf_on_party.get("job_parameters")
    else:
        return {}
Example #5
def get_job_configuration(job_id, role, party_id) -> JobConfiguration:
    jobs = Job.select(Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf,
                      Job.f_runtime_conf_on_party).where(
                          Job.f_job_id == job_id, Job.f_role == role,
                          Job.f_party_id == party_id)
    if jobs:
        job = jobs[0]
        return JobConfiguration(**job.to_human_model_dict())
Example #6
    def resource_for_job(cls, job_id, role, party_id, operation_type):
        operate_status = False
        engine_name, cores, memory = cls.calculate_job_resource(job_id=job_id, role=role, party_id=party_id)
        try:
            with DB.atomic():
                updates = {
                    Job.f_engine_type: EngineType.COMPUTING,
                    Job.f_engine_name: engine_name,
                    Job.f_cores: cores,
                    Job.f_memory: memory,
                }
                filters = [
                    Job.f_job_id == job_id,
                    Job.f_role == role,
                    Job.f_party_id == party_id,
                ]
                if operation_type == ResourceOperation.APPLY:
                    updates[Job.f_remaining_cores] = cores
                    updates[Job.f_remaining_memory] = memory
                    updates[Job.f_resource_in_use] = True
                    updates[Job.f_apply_resource_time] = base_utils.current_timestamp()
                    filters.append(Job.f_resource_in_use == False)
                elif operation_type == ResourceOperation.RETURN:
                    updates[Job.f_resource_in_use] = False
                    updates[Job.f_return_resource_time] = base_utils.current_timestamp()
                    filters.append(Job.f_resource_in_use == True)
                operate = Job.update(updates).where(*filters)
                record_status = operate.execute() > 0
                if not record_status:
                    raise RuntimeError(f"record job {job_id} resource {operation_type} failed on {role} {party_id}")

                filters, updates = cls.update_resource_sql(resource_model=EngineRegistry,
                                                           cores=cores,
                                                           memory=memory,
                                                           operation_type=operation_type,
                                                           )
                filters.append(EngineRegistry.f_engine_type == EngineType.COMPUTING)
                filters.append(EngineRegistry.f_engine_name == engine_name)
                operate = EngineRegistry.update(updates).where(*filters)
                apply_status = operate.execute() > 0
                if not apply_status:
                    raise RuntimeError(
                        f"{operation_type} resource from engine {engine_name} for job {job_id} failed on {role} {party_id}")
            operate_status = True
        except Exception as e:
            schedule_logger(job_id=job_id).warning(e)
            schedule_logger(job_id=job_id).warning(
                f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} failed")
            operate_status = False
        finally:
            remaining_cores, remaining_memory = cls.get_remaining_resource(EngineRegistry,
                                                                           [
                                                                               EngineRegistry.f_engine_type == EngineType.COMPUTING,
                                                                               EngineRegistry.f_engine_name == engine_name])
            operate_msg = "successfully" if operate_status else "failed"
            schedule_logger(job_id=job_id).info(
                f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} {operate_msg}, remaining cores: {remaining_cores} remaining memory: {remaining_memory}")
            return operate_status
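Two things make this safe under concurrency: both UPDATEs run inside `DB.atomic()`, and each failure raises, which rolls the whole transaction back, so the job row and the engine registry can never disagree. A runnable peewee sketch of that raise-to-rollback pattern, with hypothetical JobRes/EngineRes tables standing in for Job and EngineRegistry:

# Raise-to-rollback sketch (hypothetical JobRes/EngineRes tables).
from peewee import SqliteDatabase, Model, IntegerField

db = SqliteDatabase(":memory:")

class JobRes(Model):
    cores = IntegerField(default=0)

    class Meta:
        database = db

class EngineRes(Model):
    remaining_cores = IntegerField(default=10)

    class Meta:
        database = db

db.create_tables([JobRes, EngineRes])
job_row, engine_row = JobRes.create(), EngineRes.create()

try:
    with db.atomic():
        JobRes.update({JobRes.cores: 20}).where(JobRes.id == job_row.id).execute()
        # The guarded decrement fails (only 10 cores remain), so raise to roll back:
        matched = EngineRes.update(
            {EngineRes.remaining_cores: EngineRes.remaining_cores - 20}
        ).where(EngineRes.id == engine_row.id, EngineRes.remaining_cores >= 20).execute()
        if not matched:
            raise RuntimeError("apply resource failed")
except RuntimeError:
    pass

assert JobRes.get_by_id(job_row.id).cores == 0  # the first update was rolled back too
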
Example #7
 def end_scheduling_updates(cls, job_id):
     operate = Job.update({
         Job.f_end_scheduling_updates:
         Job.f_end_scheduling_updates + 1
     }).where(
         Job.f_job_id == job_id, Job.f_end_scheduling_updates <
         JobDefaultConfig.end_status_job_scheduling_updates)
     update_status = operate.execute() > 0
     return update_status
Example #8
 def rerun_signal(cls, job_id, set_or_reset: bool):
     if set_or_reset is True:
         update_fields = {Job.f_rerun_signal: True, Job.f_cancel_signal: False, Job.f_end_scheduling_updates: 0}
     elif set_or_reset is False:
         update_fields = {Job.f_rerun_signal: False}
     else:
         raise RuntimeError(f"cannot support rerun signal {set_or_reset}")
     update_status = Job.update(update_fields).where(Job.f_job_id == job_id).execute() > 0
     return update_status
Example #9
def get_job_dsl(job_id, role, party_id):
    jobs = Job.select(Job.f_dsl).where(Job.f_job_id == job_id,
                                       Job.f_role == role,
                                       Job.f_party_id == party_id)
    if jobs:
        job = jobs[0]
        return job.f_dsl
    else:
        return {}
Example #10
 def end_scheduling_updates(cls, job_id):
     operate = Job.update({
         Job.f_end_scheduling_updates:
         Job.f_end_scheduling_updates + 1
     }).where(
         Job.f_job_id == job_id,
         Job.f_end_scheduling_updates < END_STATUS_JOB_SCHEDULING_UPDATES)
     update_status = operate.execute() > 0
     return update_status
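Both variants implement a capped counter: the `<` guard in the WHERE clause stops the increment once the configured maximum is reached, which bounds how many extra scheduling passes a finished job gets. A short runnable sketch of the cap (hypothetical Counter model; the cap of 3 stands in for the configured limit):

# Capped-increment sketch (hypothetical Counter model).
from peewee import SqliteDatabase, Model, IntegerField

db = SqliteDatabase(":memory:")

class Counter(Model):
    updates = IntegerField(default=0)

    class Meta:
        database = db

db.create_tables([Counter])
row = Counter.create()
CAP = 3  # stands in for end_status_job_scheduling_updates

def bump() -> bool:
    # Increment only while below the cap; False once exhausted.
    return Counter.update(
        {Counter.updates: Counter.updates + 1}
    ).where(Counter.id == row.id, Counter.updates < CAP).execute() > 0

assert [bump() for _ in range(5)] == [True, True, True, False, False]
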
Example #11
def get_job_configuration(job_id, role, party_id, tasks=None):
    if tasks:
        jobs_run_conf = {}
        for task in tasks:
            jobs = Job.select(Job.f_job_id, Job.f_runtime_conf_on_party, Job.f_description).where(Job.f_job_id == task.f_job_id)
            job = jobs[0]
            jobs_run_conf[job.f_job_id] = job.f_runtime_conf_on_party["component_parameters"]["role"]["local"]["0"]["upload_0"]
            jobs_run_conf[job.f_job_id]["notes"] = job.f_description
        return jobs_run_conf
    else:
        jobs = Job.select(Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf, Job.f_runtime_conf_on_party).where(Job.f_job_id == job_id,
                                                                                                                      Job.f_role == role,
                                                                                                                      Job.f_party_id == party_id)
    if jobs:
        job = jobs[0]
        return job.f_dsl, job.f_runtime_conf, job.f_runtime_conf_on_party, job.f_train_runtime_conf
    else:
        return {}, {}, {}, {}
Example #12
def get_upload_job_configuration_summary(upload_tasks: typing.List[Task]):
    jobs_run_conf = {}
    for task in upload_tasks:
        jobs = Job.select(
            Job.f_job_id, Job.f_runtime_conf_on_party,
            Job.f_description).where(Job.f_job_id == task.f_job_id)
        job = jobs[0]
        jobs_run_conf[job.f_job_id] = job.f_runtime_conf_on_party[
            "component_parameters"]["role"]["local"]["0"]["upload_0"]
        jobs_run_conf[job.f_job_id]["notes"] = job.f_description
    return jobs_run_conf
Example #13
def get_job_configuration(job_id, role, party_id):
    with DB.connection_context():
        jobs = Job.select(Job.f_dsl, Job.f_runtime_conf,
                          Job.f_train_runtime_conf).where(
                              Job.f_job_id == job_id, Job.f_role == role,
                              Job.f_party_id == party_id)
        if jobs:
            job = jobs[0]
            return json_loads(job.f_dsl), json_loads(
                job.f_runtime_conf), json_loads(job.f_train_runtime_conf)
        else:
            return {}, {}, {}
Example #14
def get_job_dsl_parser_by_job_id(job_id):
    jobs = Job.select(Job.f_dsl, Job.f_runtime_conf,
                      Job.f_train_runtime_conf).where(Job.f_job_id == job_id)
    if jobs:
        job = jobs[0]
        job_dsl_parser = get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf,
            train_runtime_conf=job.f_train_runtime_conf)
        return job_dsl_parser
    else:
        return None
Example #15
 def update_job_on_initiator(cls, initiator_job: Job, update_fields: list):
     jobs = JobSaver.query_job(job_id=initiator_job.f_job_id)
     if not jobs:
         raise Exception("Failed to update job status on initiator")
     job_info = initiator_job.to_human_model_dict(only_primary_with=update_fields)
     for field in update_fields:
         job_info[field] = getattr(initiator_job, "f_%s" % field)
     for job in jobs:
         job_info["role"] = job.f_role
         job_info["party_id"] = job.f_party_id
         JobSaver.update_job_status(job_info=job_info)
         JobSaver.update_job(job_info=job_info)
Example #16
 def ready_signal(cls, job_id, set_or_reset: bool, ready_timeout_ttl=None):
     filters = [Job.f_job_id == job_id]
     if set_or_reset:
         update_fields = {Job.f_ready_signal: True, Job.f_ready_time: current_timestamp()}
         filters.append(Job.f_ready_signal == False)
     else:
         update_fields = {Job.f_ready_signal: False, Job.f_ready_time: None}
         filters.append(Job.f_ready_signal == True)
         if ready_timeout_ttl:
             filters.append(current_timestamp() - Job.f_ready_time > ready_timeout_ttl)
     update_status = Job.update(update_fields).where(*filters).execute() > 0
     return update_status
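Note the symmetry with rerun_signal in Example #8: setting the flag is a compare-and-set, because the extra filter (`Job.f_ready_signal == False`) means only one caller's UPDATE can match the row. A runnable sketch of that claim semantics (hypothetical JobRow model):

# Compare-and-set sketch (hypothetical JobRow model).
import time

from peewee import SqliteDatabase, Model, BooleanField, BigIntegerField

db = SqliteDatabase(":memory:")

class JobRow(Model):
    ready_signal = BooleanField(default=False)
    ready_time = BigIntegerField(null=True)

    class Meta:
        database = db

db.create_tables([JobRow])
job = JobRow.create()

def try_claim() -> bool:
    # Succeeds only if the flag is still unset, like the
    # Job.f_ready_signal == False filter above.
    return JobRow.update(
        {JobRow.ready_signal: True, JobRow.ready_time: int(time.time() * 1000)}
    ).where(JobRow.id == job.id, JobRow.ready_signal == False).execute() > 0

assert try_claim() is True   # first caller claims the job
assert try_claim() is False  # second caller loses the race
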
Example #17
def check_request_parameters(request_data):
    if 'role' not in request_data and 'party_id' not in request_data:
        jobs = Job.select(Job.f_runtime_conf_on_party).where(Job.f_job_id == request_data.get('job_id', ''),
                                                             Job.f_is_initiator == True)
        if jobs:
            job = jobs[0]
            job_runtime_conf = job.f_runtime_conf_on_party
            job_initiator = job_runtime_conf.get('initiator', {})
            role = job_initiator.get('role', '')
            party_id = job_initiator.get('party_id', 0)
            request_data['role'] = role
            request_data['party_id'] = party_id
Example #18
def query_job(**kwargs):
    with DB.connection_context():
        filters = []
        for f_n, f_v in kwargs.items():
            attr_name = 'f_%s' % f_n
            if hasattr(Job, attr_name):
                filters.append(operator.attrgetter('f_%s' % f_n)(Job) == f_v)
        if filters:
            jobs = Job.select().where(*filters)
            return [job for job in jobs]
        else:
            # do not allow querying all jobs
            return []
Example #19
def check_request_parameters(request_data):
    with DB.connection_context():
        if 'role' not in request_data and 'party_id' not in request_data:
            jobs = Job.select(Job.f_runtime_conf).where(Job.f_job_id == request_data.get('job_id', ''),
                                                        Job.f_is_initiator == 1)
            if jobs:
                job = jobs[0]
                job_runtime_conf = json_loads(job.f_runtime_conf)
                job_initiator = job_runtime_conf.get('initiator', {})
                role = job_initiator.get('role', '')
                party_id = job_initiator.get('party_id', 0)
                request_data['role'] = role
                request_data['party_id'] = party_id
Example #20
def get_job_dsl_parser_by_job_id(job_id):
    with DB.connection_context():
        jobs = Job.select(
            Job.f_dsl, Job.f_runtime_conf,
            Job.f_train_runtime_conf).where(Job.f_job_id == job_id)
        if jobs:
            job = jobs[0]
            job_dsl_parser = get_job_dsl_parser(
                dsl=json_loads(job.f_dsl),
                runtime_conf=json_loads(job.f_runtime_conf),
                train_runtime_conf=json_loads(job.f_train_runtime_conf))
            return job_dsl_parser
        else:
            return None
Example #21
def get_job_configuration(job_id, role, party_id, tasks=None):
    with DB.connection_context():
        if tasks:
            jobs_run_conf = {}
            for task in tasks:
                jobs = Job.select(
                    Job.f_job_id, Job.f_runtime_conf,
                    Job.f_description).where(Job.f_job_id == task.f_job_id)
                job = jobs[0]
                jobs_run_conf[job.f_job_id] = json_loads(
                    job.f_runtime_conf)["role_parameters"]["local"]["upload_0"]
                jobs_run_conf[job.f_job_id]["notes"] = job.f_description
            return jobs_run_conf
        else:
            jobs = Job.select(Job.f_dsl, Job.f_runtime_conf,
                              Job.f_train_runtime_conf).where(
                                  Job.f_job_id == job_id, Job.f_role == role,
                                  Job.f_party_id == party_id)
        if jobs:
            job = jobs[0]
            return json_loads(job.f_dsl), json_loads(
                job.f_runtime_conf), json_loads(job.f_train_runtime_conf)
        else:
            return {}, {}, {}
Example #22
    def save_machine_learning_model_info(self):
        try:
            record = MLModel.get_or_none(MLModel.f_model_version == self.job_id,
                                         MLModel.f_role == self.role,
                                         MLModel.f_model_id == self.model_id,
                                         MLModel.f_party_id == self.party_id)
            if not record:
                job = Job.get_or_none(Job.f_job_id == self.job_id)
                pipeline = self.pipelined_model.read_pipeline_model()
                if job:
                    job_data = job.to_dict()
                    model_info = {
                        'job_id': job_data.get("f_job_id"),
                        'role': self.role,
                        'party_id': self.party_id,
                        'roles': job_data.get("f_roles"),
                        'model_id': self.model_id,
                        'model_version': self.model_version,
                        'initiator_role': job_data.get('f_initiator_role'),
                        'initiator_party_id': job_data.get('f_initiator_party_id'),
                        'runtime_conf': job_data.get('f_runtime_conf'),
                        'work_mode': job_data.get('f_work_mode'),
                        'train_dsl': job_data.get('f_dsl'),
                        'train_runtime_conf': job_data.get('f_train_runtime_conf'),
                        'size': self.get_model_size(),
                        'job_status': job_data.get('f_status'),
                        'parent': pipeline.parent,
                        'fate_version': pipeline.fate_version,
                        'runtime_conf_on_party': json_loads(pipeline.runtime_conf_on_party),
                        'parent_info': json_loads(pipeline.parent_info),
                        'inference_dsl': json_loads(pipeline.inference_dsl)
                    }
                    model_utils.save_model_info(model_info)

                    schedule_logger(self.job_id).info(
                        'save {} model info done. model id: {}, model version: {}.'.format(self.job_id,
                                                                                           self.model_id,
                                                                                           self.model_version))
                else:
                    schedule_logger(self.job_id).info(
                        'save {} model info failed, no job found in db. '
                        'model id: {}, model version: {}.'.format(self.job_id,
                                                                  self.model_id,
                                                                  self.model_version))
            else:
                schedule_logger(self.job_id).info('model {} info has already existed in database.'.format(self.job_id))
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
Example #23
 def query_job(cls, reverse=None, order_by=None, **kwargs):
     filters = []
     for f_n, f_v in kwargs.items():
         attr_name = 'f_%s' % f_n
         if hasattr(Job, attr_name):
             filters.append(operator.attrgetter('f_%s' % f_n)(Job) == f_v)
     if filters:
         jobs = Job.select().where(*filters)
         if reverse is not None:
             if not order_by or not hasattr(Job, f"f_{order_by}"):
                 order_by = "create_time"
             if reverse is True:
                 jobs = jobs.order_by(getattr(Job, f"f_{order_by}").desc())
             elif reverse is False:
                 jobs = jobs.order_by(getattr(Job, f"f_{order_by}").asc())
         return [job for job in jobs]
     else:
         # do not allow querying all jobs
         return []
Example #24
    def save_machine_learning_model_info(self):
        try:
            record = MLModel.get_or_none(
                MLModel.f_model_version == self.job_id)
            if not record:
                job = Job.get_or_none(Job.f_job_id == self.job_id)
                if job:
                    job_data = job.to_json()
                    MLModel.create(
                        f_role=self.role,
                        f_party_id=self.party_id,
                        f_roles=job_data.get("f_roles"),
                        f_model_id=self.model_id,
                        f_model_version=self.model_version,
                        f_job_id=job_data.get("f_job_id"),
                        f_create_time=current_timestamp(),
                        f_initiator_role=job_data.get('f_initiator_role'),
                        f_initiator_party_id=job_data.get(
                            'f_initiator_party_id'),
                        f_runtime_conf=job_data.get('f_runtime_conf'),
                        f_work_mode=job_data.get('f_work_mode'),
                        f_dsl=job_data.get('f_dsl'),
                        f_train_runtime_conf=job_data.get(
                            'f_train_runtime_conf'),
                        f_size=self.get_model_size(),
                        f_job_status=job_data.get('f_status'))

                    schedule_logger(self.job_id).info(
                        'save {} model info done. model id: {}, model version: {}.'
                        .format(self.job_id, self.model_id,
                                self.model_version))
                else:
                    schedule_logger(self.job_id).info(
                        'save {} model info failed, no job found in db. '
                        'model id: {}, model version: {}.'.format(
                            self.job_id, self.model_id, self.model_version))
            else:
                schedule_logger(self.job_id).info(
                    'model {} info has already existed in database.'.format(
                        self.job_id))
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
Example #25
 def detect_resource_record(cls):
     detect_logger().info('start detect resource recycle')
     try:
         filter_status = EndStatus.status_list()
         filter_status.append(JobStatus.WAITING)
         jobs = Job.select().where(
             Job.f_resource_in_use == True,
             current_timestamp() - Job.f_apply_resource_time >
             10 * 60 * 1000, Job.f_status << filter_status)
         stop_jobs = set()
         for job in jobs:
             if job.f_status == JobStatus.WAITING:
                 stop_jobs.add(job)
             else:
                 try:
                     detect_logger(job_id=job.f_job_id).info(
                         f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource"
                     )
                     flag = ResourceManager.return_job_resource(
                         job_id=job.f_job_id,
                         role=job.f_role,
                         party_id=job.f_party_id)
                     if flag:
                         detect_logger(job_id=job.f_job_id).info(
                             f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully"
                         )
                     else:
                         detect_logger(job_id=job.f_job_id).info(
                             f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed"
                         )
                 except Exception as e:
                     detect_logger(job_id=job.f_job_id).exception(e)
         cls.request_stop_jobs(jobs=stop_jobs,
                               stop_msg="start timeout",
                               stop_status=JobStatus.TIMEOUT)
     except Exception as e:
         detect_logger().exception(e)
     finally:
         detect_logger().info('finish detect resource recycle')
Example #26
 def query_job(cls, reverse=None, order_by=None, **kwargs):
     filters = []
     for f_n, f_v in kwargs.items():
         attr_name = 'f_%s' % f_n
         if attr_name in ['f_start_time', 'f_end_time', 'f_elapsed'
                          ] and isinstance(f_v, list):
             if attr_name == 'f_elapsed':
                 b_timestamp = f_v[0]
                 e_timestamp = f_v[1]
             else:
                 # time type: %Y-%m-%d %H:%M:%S
                 b_timestamp = str_to_time_stamp(f_v[0]) if isinstance(
                     f_v[0], str) else f_v[0]
                 e_timestamp = str_to_time_stamp(f_v[1]) if isinstance(
                     f_v[1], str) else f_v[1]
             filters.append(
                 getattr(Job, attr_name).between(b_timestamp, e_timestamp))
         elif hasattr(Job, attr_name):
             if isinstance(f_v, set):
                 filters.append(
                     operator.attrgetter('f_%s' % f_n)(Job) << f_v)
             else:
                 filters.append(
                     operator.attrgetter('f_%s' % f_n)(Job) == f_v)
     if filters:
         jobs = Job.select().where(*filters)
         if reverse is not None:
             if not order_by or not hasattr(Job, f"f_{order_by}"):
                 order_by = "create_time"
             if reverse is True:
                 jobs = jobs.order_by(getattr(Job, f"f_{order_by}").desc())
             elif reverse is False:
                 jobs = jobs.order_by(getattr(Job, f"f_{order_by}").asc())
         return [job for job in jobs]
     else:
         return []
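Because every keyword is translated into a filter (scalar -> equality, set -> IN via peewee's `<<`, time-range list -> BETWEEN), callers can compose queries declaratively. A hedged usage sketch, assuming these classmethods live on FATE's JobSaver class as in Example #15 (the values are illustrative):

# Hypothetical usage of the query_job variant above.
jobs = JobSaver.query_job(
    status={"running", "waiting"},                              # set -> IN
    start_time=["2022-01-01 00:00:00", "2022-01-02 00:00:00"],  # list -> BETWEEN
    role="guest",                                               # scalar -> equality
    reverse=True,
    order_by="start_time",                                      # newest first
)
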
Example #27
 def create_job(cls, job: Job):
     return cls.job_command(job=job,
                            command="create",
                            command_body=job.to_human_model_dict(),
                            order_federated=True)
Example #28
    def run_job(job_id, initiator_role, initiator_party_id):
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
            job_id=job_id, role=initiator_role, party_id=initiator_party_id)
        job_parameters = job_runtime_conf.get('job_parameters', {})
        job_initiator = job_runtime_conf.get('initiator', {})
        dag = get_job_dsl_parser(dsl=job_dsl,
                                 runtime_conf=job_runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        if not job_initiator:
            return False
        storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE)
        job = Job()
        job.f_job_id = job_id
        job.f_start_time = current_timestamp()
        job.f_status = JobStatus.RUNNING
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job.to_json())

        top_level_task_status = set()
        components = dag.get_next_components(None)
        schedule_logger.info('job {} root components are {}'.format(
            job.f_job_id,
            [component.get_name() for component in components]))
        for component in components:
            try:
                # run a component as task
                run_status = TaskScheduler.run_component(
                    job_id, job_runtime_conf, job_parameters, job_initiator,
                    job_args, dag, component)
            except Exception as e:
                schedule_logger.info(e)
                run_status = False
            top_level_task_status.add(run_status)
            if not run_status:
                break
        if len(top_level_task_status) == 2:
            job.f_status = JobStatus.PARTIAL
        elif True in top_level_task_status:
            job.f_status = JobStatus.SUCCESS
        else:
            job.f_status = JobStatus.FAILED
        job.f_end_time = current_timestamp()
        job.f_elapsed = job.f_end_time - job.f_start_time
        if job.f_status == JobStatus.SUCCESS:
            job.f_progress = 100
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job.to_json())
        TaskScheduler.finish_job(job_id=job_id,
                                 job_runtime_conf=job_runtime_conf)
        schedule_logger.info('job {} finished, status is {}'.format(
            job.f_job_id, job.f_status))
Example #29
    def submit_job(job_data):
        job_id = generate_job_id()
        schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
        job_parameters = job_runtime_conf['job_parameters']
        job_initiator = job_runtime_conf['initiator']
        job_type = job_parameters.get('job_type', '')
        if job_type != 'predict':
            # generate job model info
            job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
            job_parameters['model_version'] = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                                   model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
            pipeline_model = job_tracker.get_output_model('pipeline')
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
        job_dsl_path, job_runtime_conf_path = save_job_conf(job_id=job_id,
                                                            job_dsl=job_dsl,
                                                            job_runtime_conf=job_runtime_conf)

        job = Job()
        job.f_job_id = job_id
        job.f_roles = json_dumps(job_runtime_conf['role'])
        job.f_work_mode = job_parameters['work_mode']
        job.f_initiator_party_id = job_initiator['party_id']
        job.f_dsl = json_dumps(job_dsl)
        job.f_runtime_conf = json_dumps(job_runtime_conf)
        job.f_train_runtime_conf = json_dumps(train_runtime_conf)
        job.f_run_ip = ''
        job.f_status = JobStatus.WAITING
        job.f_progress = 0
        job.f_create_time = current_timestamp()

        # save job info
        TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)

        # push into queue
        RuntimeConfig.JOB_QUEUE.put_event({
            'job_id': job_id,
            "initiator_role": job_initiator['role'],
            "initiator_party_id": job_initiator['party_id']
        }
        )
        schedule_logger.info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
        board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
        return job_id, job_dsl_path, job_runtime_conf_path, {'model_id': job_parameters['model_id'],
                                                             'model_version': job_parameters[
                                                                 'model_version']}, board_url
Example #30
    def run_job(job_id, initiator_role, initiator_party_id):
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                        role=initiator_role,
                                                                                        party_id=initiator_party_id)
        job_parameters = job_runtime_conf.get('job_parameters', {})
        job_initiator = job_runtime_conf.get('initiator', {})
        dag = get_job_dsl_parser(dsl=job_dsl,
                                 runtime_conf=job_runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        if not job_initiator:
            return False
        timeout = job_utils.get_timeout(job_id, job_parameters.get("timeout", None), job_runtime_conf, job_dsl)
        t = Timer(timeout, TaskScheduler.job_handler, [job_id])
        t.start()

        job = Job()
        job.f_job_id = job_id
        job.f_start_time = current_timestamp()
        job.f_status = JobStatus.RUNNING
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                      work_mode=job_parameters['work_mode'],
                                      initiator_party_id=job_initiator['party_id'],
                                      initiator_role=job_initiator['role'],
                                      job_info=job.to_json())

        top_level_task_status = set()
        components = dag.get_next_components(None)
        schedule_logger(job_id).info(
            'job {} root components are {}'.format(
                job.f_job_id, [component.get_name() for component in components]))
        for component in components:
            try:
                # run a component as task
                run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters, job_initiator,
                                                         job_args, dag,
                                                         component)
            except Exception as e:
                schedule_logger(job_id).exception(e)
                run_status = False
            top_level_task_status.add(run_status)
            if not run_status:
                break
        if len(top_level_task_status) == 2:
            job.f_status = JobStatus.FAILED
        elif True in top_level_task_status:
            job.f_status = JobStatus.COMPLETE
        else:
            job.f_status = JobStatus.FAILED
        job.f_end_time = current_timestamp()
        job.f_elapsed = job.f_end_time - job.f_start_time
        if job.f_status == JobStatus.COMPLETE:
            job.f_progress = 100
        job.f_update_time = current_timestamp()
        try:
            TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
        except Exception as e:
            schedule_logger(job_id).exception(e)
            job.f_status = JobStatus.FAILED

        if job.f_status == JobStatus.FAILED:
            TaskScheduler.stop(job_id=job_id, end_status=JobStatus.FAILED)

        try:
            TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                          work_mode=job_parameters['work_mode'],
                                          initiator_party_id=job_initiator['party_id'],
                                          initiator_role=job_initiator['role'],
                                          job_info=job.to_json())
        except Exception as e:
            schedule_logger(job_id).exception(e)
            schedule_logger(job_id).warning('job {} sync status failed'.format(job.f_job_id))

        schedule_logger(job_id).info('job {} finished, status is {}'.format(job.f_job_id, job.f_status))
        t.cancel()