Example #1
    def __init__(self,
                 job_id: str,
                 role: str,
                 party_id: int,
                 model_id: str = None,
                 model_version: str = None,
                 component_name: str = None,
                 component_module_name: str = None,
                 task_id: str = None,
                 task_version: int = None,
                 job_parameters: RunParameters = None):
        self.job_id = job_id
        self.role = role
        self.party_id = party_id
        self.model_id = model_id
        self.party_model_id = model_utils.gen_party_model_id(model_id=model_id,
                                                             role=role,
                                                             party_id=party_id)
        self.model_version = model_version
        self.pipelined_model = None
        if self.party_model_id and self.model_version:
            self.pipelined_model = pipelined_model.PipelinedModel(
                model_id=self.party_model_id, model_version=self.model_version)

        self.component_name = component_name if component_name else job_utils.job_virtual_component_name()
        self.module_name = component_module_name if component_module_name else job_utils.job_virtual_component_module_name()
        self.task_id = task_id
        self.task_version = task_version
        self.job_parameters = job_parameters
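
A minimal construction sketch for context: this __init__ appears to belong to a job tracker class. "Tracker" as the class name and every literal value below are assumptions for illustration, not taken from the snippet.

    # Hypothetical usage sketch; "Tracker" and all literal values are assumed:
    tracker = Tracker(job_id="202101011200000000000",
                      role="guest",
                      party_id=9999,
                      model_id="guest-9999#host-10000#model",
                      model_version="202101011200000000000",
                      component_name="hetero_lr_0")
    # With both model_id and model_version set, __init__ also creates a
    # PipelinedModel handle keyed by the party-scoped model id.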
Example #2
 def read_metrics_from_db(self,
                          metric_namespace: str,
                          metric_name: str,
                          data_type,
                          job_level=False):
     try:
         tracking_metric_model = self.get_dynamic_db_model(
             TrackingMetric, self.job_id)
         tracking_metrics = tracking_metric_model.select(
             tracking_metric_model.f_key,
             tracking_metric_model.f_value).where(
                 tracking_metric_model.f_job_id == self.job_id,
                 tracking_metric_model.f_component_name == (
                     self.component_name if not job_level else
                     job_utils.job_virtual_component_name()),
                 tracking_metric_model.f_role == self.role,
                 tracking_metric_model.f_party_id == self.party_id,
                 tracking_metric_model.f_metric_namespace ==
                 metric_namespace,
                 tracking_metric_model.f_metric_name == metric_name,
                 tracking_metric_model.f_type == data_type)
         for tracking_metric in tracking_metrics:
             yield deserialize_b64(tracking_metric.f_key), deserialize_b64(
                 tracking_metric.f_value)
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         raise
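
read_metrics_from_db is a generator: it yields deserialized (key, value) pairs lazily rather than returning a list. A minimal usage sketch, assuming a tracker instance like the hypothetical one above; the namespace, name, and data_type values are illustrative:

    # Hypothetical usage sketch; metric namespace, name, and data_type are assumed:
    for key, value in tracker.read_metrics_from_db(metric_namespace="train",
                                                   metric_name="loss",
                                                   data_type=1):
        print(key, value)  # each pair is base64-deserialized on the fly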
Example #3
 def insert_metrics_into_db(self,
                            metric_namespace: str,
                            metric_name: str,
                            data_type: int,
                            kv,
                            job_level=False):
     try:
         tracking_metric = self.get_dynamic_db_model(
             TrackingMetric, self.job_id)()
         tracking_metric.f_job_id = self.job_id
         tracking_metric.f_component_name = (
             self.component_name
             if not job_level else job_utils.job_virtual_component_name())
         tracking_metric.f_task_id = self.task_id
         tracking_metric.f_task_version = self.task_version
         tracking_metric.f_role = self.role
         tracking_metric.f_party_id = self.party_id
         tracking_metric.f_metric_namespace = metric_namespace
         tracking_metric.f_metric_name = metric_name
         tracking_metric.f_type = data_type
         default_db_source = tracking_metric.to_json()
         tracking_metric_data_source = []
         for k, v in kv:
             db_source = default_db_source.copy()
             db_source['f_key'] = serialize_b64(k)
             db_source['f_value'] = serialize_b64(v)
             db_source['f_create_time'] = current_timestamp()
             tracking_metric_data_source.append(db_source)
         self.bulk_insert_into_db(
             self.get_dynamic_db_model(TrackingMetric, self.job_id),
             tracking_metric_data_source)
     except Exception as e:
         schedule_logger(self.job_id).exception(
             "An exception where inserted metric {} of metric namespace: {} to database:\n{}"
             .format(metric_name, metric_namespace, e))
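
The writer side mirrors the reader: each (key, value) pair in kv becomes one row, with key and value base64-serialized before the bulk insert. A minimal sketch under the same assumptions as above:

    # Hypothetical usage sketch; kv can be any iterable of (key, value) pairs:
    tracker.insert_metrics_into_db(metric_namespace="train",
                                   metric_name="loss",
                                   data_type=1,  # assumed type code, per the int annotation
                                   kv=[(0, 0.693), (1, 0.542), (2, 0.451)])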
Example #4
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.COMPLETE}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # save the task status of all participants on the initiator side for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_runtime_conf["initiator"], RunParameters(**job.f_runtime_conf["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         if EndStatus.contains(job.f_status):
             job.f_status = JobStatus.WAITING
             job.f_end_time = None
             job.f_elapsed = None
             job.f_progress = 0
             schedule_logger(job_id=job_id).info(f"job {job_id} has been finished, set waiting to rerun")
             status, response = FederatedScheduler.sync_job_status(job=job)
             if status == FederatedSchedulingStatusCode.SUCCESS:
                 FederatedScheduler.sync_job(job=job, update_fields=["end_time", "elapsed", "progress"])
                 JobQueue.create_event(job_id=job_id, initiator_role=initiator_role, initiator_party_id=initiator_party_id)
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun successfully")
             else:
                 schedule_logger(job_id=job_id).info(f"job {job_id} set waiting to rerun failed")
         else:
             # status updates may be delayed; in rare cases they arrive only after the rerun command
             schedule_logger(job_id=job_id).info(f"job {job_id} status is {job.f_status}, new version waiting tasks will be run")
     else:
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")
Example #5
 def rerun_job(cls, job_id, initiator_role, initiator_party_id, component_name):
     schedule_logger(job_id=job_id).info(f"try to rerun job {job_id} on initiator {initiator_role} {initiator_party_id}")
     jobs = JobSaver.query_job(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     if jobs:
         job = jobs[0]
     else:
         raise RuntimeError(f"can not found job {job_id} on initiator {initiator_role} {initiator_party_id}")
     if component_name != job_utils.job_virtual_component_name():
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id, component_name=component_name)
     else:
         tasks = JobSaver.query_task(job_id=job_id, role=initiator_role, party_id=initiator_party_id)
     job_can_rerun = False
     dsl_parser = schedule_utils.get_job_dsl_parser(dsl=job.f_dsl,
                                                    runtime_conf=job.f_runtime_conf_on_party,
                                                    train_runtime_conf=job.f_train_runtime_conf)
     for task in tasks:
         if task.f_status in {TaskStatus.WAITING, TaskStatus.SUCCESS}:
             if task.f_status == TaskStatus.WAITING:
                 job_can_rerun = True
             schedule_logger(job_id=job_id).info(f"task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} is {task.f_status}, pass rerun")
         else:
             # stop old version task
             FederatedScheduler.stop_task(job=job, task=task, stop_status=TaskStatus.CANCELED)
             FederatedScheduler.clean_task(job=job, task=task, content_type="metrics")
             # create new version task
             task.f_task_version = task.f_task_version + 1
             task.f_run_pid = None
             task.f_run_ip = None
             FederatedScheduler.create_task(job=job, task=task)
             # save the task status of all participants on the initiator side for scheduling
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version}")
             for _role, _party_ids in job.f_runtime_conf_on_party["role"].items():
                 for _party_id in _party_ids:
                     if _role == initiator_role and _party_id == initiator_party_id:
                         continue
                     JobController.initialize_tasks(job_id, _role, _party_id, False, job.f_initiator_role, job.f_initiator_party_id, RunParameters(**job.f_runtime_conf_on_party["job_parameters"]), dsl_parser, component_name=task.f_component_name, task_version=task.f_task_version)
             schedule_logger(job_id=job_id).info(f"create task {task.f_task_id} new version {task.f_task_version} successfully")
             job_can_rerun = True
     if job_can_rerun:
         schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal")
         status = cls.rerun_signal(job_id=job_id, set_or_reset=True)
         if status:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal successfully")
         else:
             schedule_logger(job_id=job_id).info(f"job {job_id} set rerun signal failed")
     else:
         FederatedScheduler.sync_job_status(job=job)
         schedule_logger(job_id=job_id).info(f"job {job_id} no task to rerun")