Exemple #1
0
    def resource_for_task(cls, task_info, operation_type):
        """
        Apply one resource operation for a task to its job record.

        The cores/memory a task needs come from ``cls.calculate_task_resource``.
        When both are falsy nothing is written and the operation counts as
        successful; otherwise the ``Job`` row matching the task's job id, role
        and party id (with ``f_resource_in_use`` set) is updated using the
        filters and updates returned by ``cls.update_resource_sql``.

        :param task_info: mapping providing ``job_id``, ``role``, ``party_id``,
            ``task_id`` and ``task_version``
        :param operation_type: passed through to ``cls.update_resource_sql``
            and written into the log message
        :return: True if nothing had to be updated or ``execute()`` returned a
            value greater than zero, False otherwise
        """
        cores, memory = cls.calculate_task_resource(task_info=task_info)

        succeeded = True
        if cores or memory:
            conditions, changes = cls.update_resource_sql(
                resource_model=Job,
                cores=cores,
                memory=memory,
                operation_type=operation_type,
            )
            # The comparisons below build query conditions, so "== True" is
            # intentional and must stay an equality expression.
            conditions.extend([
                Job.f_job_id == task_info["job_id"],
                Job.f_role == task_info["role"],
                Job.f_party_id == task_info["party_id"],
                Job.f_resource_in_use == True,
            ])
            affected_rows = Job.update(changes).where(*conditions).execute()
            succeeded = affected_rows > 0

        job_logger = schedule_logger(job_id=task_info["job_id"])
        if succeeded:
            job_logger.info(
                "task {} {} {} resource successfully".format(
                    task_info["task_id"], task_info["task_version"],
                    operation_type))
        else:
            job_logger.warning(
                "task {} {} {} resource failed".format(
                    task_info["task_id"], task_info["task_version"],
                    operation_type))
        return succeeded
Exemple #2
0
def kill_task_executor_process(task: Task, only_child=False):
    """
    Kill the executor process recorded for ``task`` together with its children.

    :param task: task record; ``f_run_pid`` holds the pid to stop
    :param only_child: when True only the child processes are killed and the
        recorded process itself is left alone
    :return: ``KillProcessStatusCode.NOT_FOUND`` when no pid is recorded or
        the process is not running, ``KillProcessStatusCode.ERROR_PID`` when
        the pid does not belong to this task's executor,
        ``KillProcessStatusCode.KILLED`` otherwise
    """
    job_logger = schedule_logger(task.f_job_id)
    if not task.f_run_pid:
        job_logger.info("job {} task {} {} {} with {} party status no process pid".format(
            task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, task.f_party_status))
        return KillProcessStatusCode.NOT_FOUND

    pid = int(task.f_run_pid)
    job_logger.info("try to stop job {} task {} {} {} with {} party status process pid:{}".format(
        task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, task.f_party_status, pid))

    process = None
    if check_job_process(pid):
        try:
            process = psutil.Process(pid)
        except psutil.NoSuchProcess:
            # The process exited between the liveness check and the lookup;
            # treat it exactly like a pid that was never running.
            process = None
    if process is None:
        job_logger.info("can not found job {} task {} {} {} with {} party status process pid:{}".format(
            task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, task.f_party_status, pid))
        return KillProcessStatusCode.NOT_FOUND

    if not is_task_executor_process(task=task, process=process):
        job_logger.warning("this pid {} is not job {} task {} {} {} executor".format(
            pid, task.f_job_id, task.f_task_id, task.f_role, task.f_party_id))
        return KillProcessStatusCode.ERROR_PID

    # Children are killed first, the recorded process last (unless only_child).
    targets = process.children(recursive=True)
    if not only_child:
        targets.append(process)
    for target in targets:
        if check_job_process(target.pid) and is_task_executor_process(task=task, process=target):
            try:
                target.kill()
            except psutil.NoSuchProcess:
                # Already gone (e.g. it died with its parent): nothing left to kill.
                pass

    job_logger.info("successfully stop job {} task {} {} {} process pid:{}".format(
        task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, pid))
    return KillProcessStatusCode.KILLED
Exemple #3
0
 def report_task_to_initiator(cls, task: Task):
     """
     Push the report fields of a local task to the job initiator.

     Nothing is sent unless BOTH the role and the party id of the task differ
     from the initiator's.
     NOTE(review): a party that shares only one of the two with the initiator
     is skipped as well - confirm that "and" (not "or") is really intended.

     :param task: local task record to report
     :return: True as soon as one attempt is answered with RetCode.SUCCESS,
         False when nothing was sent or every attempt failed
     """
     if task.f_role == task.f_initiator_role or task.f_party_id == task.f_initiator_party_id:
         return False

     last_error = None
     for _ in range(DEFAULT_FEDERATED_COMMAND_TRYS):
         try:
             response = federated_api(
                 job_id=task.f_job_id,
                 method='POST',
                 endpoint='/initiator/{}/{}/{}/{}/{}/{}/report'.format(
                     task.f_job_id, task.f_component_name,
                     task.f_task_id, task.f_task_version, task.f_role,
                     task.f_party_id),
                 src_party_id=task.f_party_id,
                 dest_party_id=task.f_initiator_party_id,
                 src_role=task.f_role,
                 json_body=task.to_human_model_dict(
                     only_primary_with=cls.REPORT_TO_INITIATOR_FIELDS),
                 federated_mode=task.f_federated_mode)
         except Exception as e:
             # Remember the failure and move on to the next attempt.
             last_error = e
         else:
             if response["retcode"] == RetCode.SUCCESS:
                 return True
             last_error = Exception(response["retmsg"])

     # Every attempt failed: report the most recent error.
     schedule_logger(job_id=task.f_job_id).error(
         f"report task to initiator error: {last_error}")
     return False
Exemple #4
0
def local_api(job_id,
              method,
              endpoint,
              json_body,
              api_version=API_VERSION,
              try_times=3):
    """
    Send an HTTP request to the local job server, retrying on failure.

    :param job_id: selects the audit/schedule loggers
    :param method: HTTP verb; resolved to the ``requests`` function with the
        same lower-case name
    :param endpoint: path appended after the ``/{api_version}`` prefix
    :param json_body: object serialized with ``json_dumps`` as request body
    :param api_version: version prefix put in front of ``endpoint``
    :param try_times: maximum number of attempts
    :return: the decoded JSON body of the first attempt that succeeds
    :raises Exception: when every attempt failed; the last error is attached
        as the cause
    """
    endpoint = f"/{api_version}{endpoint}"
    exception = None
    for _ in range(try_times):
        try:
            url = "http://{}:{}{}".format(RuntimeConfig.JOB_SERVER_HOST,
                                          RuntimeConfig.HTTP_PORT, endpoint)
            audit_logger(job_id).info('local api request: {}'.format(url))
            action = getattr(requests, method.lower(), None)
            if action is None:
                # Without this check the failure surfaces as an opaque
                # "'NoneType' object is not callable".
                raise ValueError(
                    'unsupported http method: {}'.format(method))
            http_response = action(url=url,
                                   data=json_dumps(json_body),
                                   headers=HEADERS)
            audit_logger(job_id).info(http_response.text)
            response = http_response.json()
            audit_logger(job_id).info('local api response: {} {}'.format(
                endpoint, response))
            return response
        except Exception as e:
            schedule_logger(job_id).exception(e)
            exception = e
    # Only reached when no attempt returned a response.
    raise Exception(
        'local request error: {}'.format(exception)) from exception
Exemple #5
0
def run_subprocess(job_id, config_dir, process_cmd, log_dir=None):
    """
    Start ``process_cmd`` as a child process and record its pid.

    stdout and stderr of the child are redirected to ``std.log`` inside
    ``log_dir`` (or inside ``config_dir`` when no log dir is given); the pid
    is written to the file ``pid`` inside ``config_dir``.

    :param job_id: selects the schedule logger
    :param config_dir: directory receiving the pid file (created if missing)
    :param process_cmd: command and arguments as a list of strings
    :param log_dir: optional directory for ``std.log`` (created if missing)
    :return: the ``subprocess.Popen`` object of the started process
    """
    schedule_logger(job_id=job_id).info('start process command: {}'.format(
        ' '.join(process_cmd)))

    os.makedirs(config_dir, exist_ok=True)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    std_log_path = os.path.join(log_dir if log_dir else config_dir, 'std.log')
    pid_path = os.path.join(config_dir, 'pid')

    if os.name == 'nt':
        # Hide the console window of the child on Windows.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE
    else:
        startupinfo = None

    # The child receives its own copy of the log file handle when it is
    # spawned, so the parent must close its handle afterwards; otherwise one
    # file descriptor leaks per started process.
    with open(std_log_path, 'w') as std_log:
        p = subprocess.Popen(process_cmd,
                             stdout=std_log,
                             stderr=std_log,
                             startupinfo=startupinfo)
    # Mode 'w' already truncates and leaving the block flushes and closes.
    with open(pid_path, 'w') as f:
        f.write(str(p.pid) + "\n")
    schedule_logger(job_id=job_id).info(
        'start process command: {} successfully, pid is {}'.format(
            ' '.join(process_cmd), p.pid))
    return p
Exemple #6
0
def start_session_stop(task):
    """
    Launch a subprocess that stops or kills the computing session of a task.

    Nothing is launched for a task whose status is ``TaskStatus.WAITING``.
    The subprocess runs the module that defines ``session_utils.SessionStop``
    with the command ``stop`` when the task status equals
    ``JobStatus.SUCCESS`` and ``kill`` otherwise.

    :param task: task record providing job/task ids, role, party id,
        component name and status
    :return: None
    """
    job_parameters = RunParameters(**get_job_parameters(
        job_id=task.f_job_id, role=task.f_role, party_id=task.f_party_id))
    computing_session_id = generate_session_id(task.f_task_id,
                                               task.f_task_version,
                                               task.f_role, task.f_party_id)
    job_logger = schedule_logger(task.f_job_id)
    if task.f_status == TaskStatus.WAITING:
        job_logger.info(
            f'task is waiting, pass stop session {computing_session_id}')
        return
    job_logger.info(
        f'start run subprocess to stop task session {computing_session_id}'
    )

    session_stop_dir = os.path.join(get_job_directory(job_id=task.f_job_id),
                                    task.f_role, task.f_party_id,
                                    task.f_component_name, 'session_stop')
    os.makedirs(session_stop_dir, exist_ok=True)

    stop_script = sys.modules[session_utils.SessionStop.__module__].__file__
    if task.f_status == JobStatus.SUCCESS:
        stop_command = 'stop'
    else:
        stop_command = 'kill'
    process_cmd = ['python3', stop_script, '-j', computing_session_id]
    process_cmd += ['--computing', job_parameters.computing_engine]
    process_cmd += ['--federation', job_parameters.federation_engine]
    process_cmd += ['--storage', job_parameters.storage_engine]
    process_cmd += ['-c', stop_command]
    # The process handle is not needed by the caller, so it is not kept.
    run_subprocess(job_id=task.f_job_id,
                   config_dir=session_stop_dir,
                   process_cmd=process_cmd,
                   log_dir=None)
Exemple #7
0
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     """
     Collect the task from every party and store what each one answered.

     The collection is skipped when the locally stored tasks share at most one
     distinct status and that status is not ``TaskStatus.RUNNING``.

     :param job: job the task belongs to
     :param initiator_task: the initiator's record of the task
     :param set_status: party status written for parties that answered with
         ``RetCode.FEDERATED_ERROR``; when falsy such parties are left as is
     :return: None
     """
     tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id, task_version=initiator_task.f_task_version)
     known_statuses = {party_task.f_status for party_task in tasks_on_all_party}
     if len(known_statuses) <= 1 and TaskStatus.RUNNING not in known_statuses:
         return

     status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         # Only a warning: the per-party answers below are still processed.
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed")

     for _role, role_responses in federated_response.items():
         for _party_id, party_response in role_responses.items():
             retcode = party_response["retcode"]
             if retcode == RetCode.SUCCESS:
                 JobSaver.update_task_status(task_info=party_response["data"])
                 JobSaver.update_task(task_info=party_response["data"])
                 continue
             if retcode != RetCode.FEDERATED_ERROR or not set_status:
                 continue
             unreachable_task_info = {
                 "job_id": initiator_task.f_job_id,
                 "task_id": initiator_task.f_task_id,
                 "task_version": initiator_task.f_task_version,
                 "role": _role,
                 "party_id": _party_id,
             }
             # Two updates on the same dict: first RUNNING, then set_status.
             for party_status in (TaskStatus.RUNNING, set_status):
                 unreachable_task_info["party_status"] = party_status
                 JobSaver.update_task_status(task_info=unreachable_task_info)
Exemple #8
0
 def insert_metrics_into_db(self,
                            metric_namespace: str,
                            metric_name: str,
                            data_type: int,
                            kv,
                            job_level=False):
     """
     Store key/value pairs of one metric as rows of the tracking metric table.

     Every row shares the job, component, task, role, party, namespace, name
     and type columns; key and value are stored through ``serialize_b64``.
     Errors are logged and not propagated.

     :param metric_namespace: value of the ``f_metric_namespace`` column
     :param metric_name: value of the ``f_metric_name`` column
     :param data_type: value of the ``f_type`` column
     :param kv: iterable of ``(key, value)`` pairs
     :param job_level: when True the rows are filed under
         ``job_utils.job_virtual_component_name()`` instead of this tracker's
         component name
     :return: None
     """
     try:
         if job_level:
             component_name = job_utils.job_virtual_component_name()
         else:
             component_name = self.component_name

         # A throw-away model instance provides the columns shared by all rows.
         template = self.get_dynamic_db_model(TrackingMetric, self.job_id)()
         template.f_job_id = self.job_id
         template.f_component_name = component_name
         template.f_task_id = self.task_id
         template.f_task_version = self.task_version
         template.f_role = self.role
         template.f_party_id = self.party_id
         template.f_metric_namespace = metric_namespace
         template.f_metric_name = metric_name
         template.f_type = data_type
         shared_columns = template.to_json()

         rows = []
         for key, value in kv:
             row = shared_columns.copy()
             row.update(f_key=serialize_b64(key),
                        f_value=serialize_b64(value),
                        f_create_time=current_timestamp())
             rows.append(row)

         self.bulk_insert_into_db(
             self.get_dynamic_db_model(TrackingMetric, self.job_id), rows)
     except Exception as e:
         schedule_logger(self.job_id).exception(
             "An exception where inserted metric {} of metric namespace: {} to database:\n{}"
             .format(metric_name, metric_namespace, e))
Exemple #9
0
    def get_output_data_table(self, output_data_infos, tracker_client=None):
        """
        Get component output data table, will run in the task executor process

        :param output_data_infos: records providing ``f_task_id``,
            ``f_task_version``, ``f_table_namespace``, ``f_table_name`` and
            ``f_data_name``; may be empty or None
        :param tracker_client: when given, the table meta is fetched through
            its ``get_table_meta``; otherwise a ``storage.StorageTableMeta``
            is built directly
        :return: dict mapping each output data name to its table meta
        """
        tables_meta = {}
        if not output_data_infos:
            return tables_meta

        for info in output_data_infos:
            table_name = info.f_table_name
            table_namespace = info.f_table_namespace
            schedule_logger(self.job_id).info(
                "Get task {} {} output table {} {}".format(
                    info.f_task_id, info.f_task_version,
                    table_namespace, table_name))
            if tracker_client:
                table_meta = tracker_client.get_table_meta(
                    table_name, table_namespace)
            else:
                table_meta = storage.StorageTableMeta(
                    name=table_name, namespace=table_namespace)
            tables_meta[info.f_data_name] = table_meta
        return tables_meta
Exemple #10
0
 def query_output_data_infos(cls, **kwargs):
     """
     Query output data records, keeping only the highest task version of each.

     Every keyword ``name=value`` whose ``f_<name>`` attribute exists on the
     model becomes an equality filter; other keywords are ignored. Records
     are grouped by ``cls.get_output_data_group_key(task id, data name)``.
     Errors are logged and an empty list is returned.

     :param kwargs: filter values; ``job_id`` also selects the model and
         the logger
     :return: the values view of the per-group records, or ``[]`` on error
     """
     try:
         model = cls.get_dynamic_db_model(TrackingOutputDataInfo,
                                          kwargs.get("job_id"))
         conditions = [
             getattr(model, 'f_%s' % name) == value
             for name, value in kwargs.items()
             if hasattr(model, 'f_%s' % name)
         ]
         query = model.select()
         if conditions:
             query = query.where(*conditions)

         # Only the latest version of the task output data is retrieved
         latest_by_group = {}
         for candidate in query:
             group_key = cls.get_output_data_group_key(
                 candidate.f_task_id, candidate.f_data_name)
             kept = latest_by_group.get(group_key)
             if kept is None or candidate.f_task_version > kept.f_task_version:
                 latest_by_group[group_key] = candidate
         return latest_by_group.values()
     except Exception as e:
         schedule_logger(kwargs.get("job_id")).exception(e)
         return []
Exemple #11
0
 def insert_summary_into_db(self, summary_data: dict):
     """
     Create or update the summary row of this tracker's component task.

     The table is created on demand. A row matching job id, component name,
     role, party id, task id and task version is updated when it exists,
     otherwise a new row is created. Errors are logged and not propagated.

     :param summary_data: summary stored through
         ``serialize_b64(..., to_str=True)``
     :return: None
     """
     try:
         summary_model = self.get_dynamic_db_model(ComponentSummary,
                                                   self.job_id)
         DB.create_tables([summary_model])
         existing = summary_model.get_or_none(
             summary_model.f_job_id == self.job_id,
             summary_model.f_component_name == self.component_name,
             summary_model.f_role == self.role,
             summary_model.f_party_id == self.party_id,
             summary_model.f_task_id == self.task_id,
             summary_model.f_task_version == self.task_version)
         if not existing:
             create_model = self.get_dynamic_db_model(ComponentSummary,
                                                      self.job_id)
             create_model.create(
                 f_job_id=self.job_id,
                 f_component_name=self.component_name,
                 f_role=self.role,
                 f_party_id=self.party_id,
                 f_task_id=self.task_id,
                 f_task_version=self.task_version,
                 f_summary=serialize_b64(summary_data, to_str=True),
                 f_create_time=current_timestamp())
         else:
             existing.f_summary = serialize_b64(summary_data, to_str=True)
             existing.f_update_time = current_timestamp()
             existing.save()
     except Exception as e:
         schedule_logger(self.job_id).exception(
             "An exception where querying summary job id: {} "
             "component name: {} to database:\n{}".format(
                 self.job_id, self.component_name, e))
Exemple #12
0
 def read_metrics_from_db(self,
                          metric_namespace: str,
                          metric_name: str,
                          data_type,
                          job_level=False):
     """
     Lazily yield the stored (key, value) pairs of one metric.

     This is a generator: the query runs, and any error is raised, only once
     the caller starts iterating.

     :param metric_namespace: value matched against ``f_metric_namespace``
     :param metric_name: value matched against ``f_metric_name``
     :param data_type: value matched against ``f_type``
     :param job_level: when True the rows are looked up under
         ``job_utils.job_virtual_component_name()`` instead of this tracker's
         component name
     :return: generator of ``(key, value)`` tuples decoded with
         ``deserialize_b64``
     :raises Exception: any error is logged and then re-raised unchanged
     """
     try:
         if job_level:
             component_name = job_utils.job_virtual_component_name()
         else:
             component_name = self.component_name
         tracking_metric_model = self.get_dynamic_db_model(
             TrackingMetric, self.job_id)
         tracking_metrics = tracking_metric_model.select(
             tracking_metric_model.f_key,
             tracking_metric_model.f_value).where(
                 tracking_metric_model.f_job_id == self.job_id,
                 tracking_metric_model.f_component_name == component_name,
                 tracking_metric_model.f_role == self.role,
                 tracking_metric_model.f_party_id == self.party_id,
                 tracking_metric_model.f_metric_namespace ==
                 metric_namespace,
                 tracking_metric_model.f_metric_name == metric_name,
                 tracking_metric_model.f_type == data_type)
         for tracking_metric in tracking_metrics:
             yield deserialize_b64(tracking_metric.f_key), deserialize_b64(
                 tracking_metric.f_value)
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         # Bare raise keeps the original traceback intact.
         raise
Exemple #13
0
 def save_pipelined_model(cls, job_id, role, party_id):
     """
     Build the pipeline description of a job and store it with the model.

     Jobs whose ``job_type`` parameter is ``'predict'`` are skipped. For any
     role other than ``'local'`` the machine learning model info is saved
     as well.

     :param job_id: job whose configuration is loaded
     :param role: role of the party; also selects the predict DSL
     :param party_id: id of the party
     :return: None
     """
     schedule_logger(job_id).info(
         'job {} on {} {} start to save pipeline'.format(
             job_id, role, party_id))
     dsl, runtime_conf, train_conf = job_utils.get_job_configuration(
         job_id=job_id, role=role, party_id=party_id)
     parameters = runtime_conf.get('job_parameters', {})
     # Both keys are required, and are read before the predict check.
     model_id, model_version = parameters['model_id'], parameters['model_version']
     if parameters.get('job_type', '') == 'predict':
         return

     dsl_parser = schedule_utils.get_job_dsl_parser(
         dsl=dsl,
         runtime_conf=runtime_conf,
         train_runtime_conf=train_conf)
     predict_dsl = dsl_parser.get_predict_dsl(role=role)

     pipeline = pipeline_pb2.Pipeline()
     pipeline.model_id = model_id
     pipeline.model_version = model_version
     pipeline.fate_version = RuntimeConfig.get_env("FATE")
     pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
     pipeline.train_dsl = json_dumps(dsl, byte=True)
     pipeline.train_runtime_conf = json_dumps(runtime_conf, byte=True)

     tracker = Tracker(job_id=job_id,
                       role=role,
                       party_id=party_id,
                       model_id=model_id,
                       model_version=model_version)
     tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
     if role != 'local':
         tracker.save_machine_learning_model_info()
     schedule_logger(job_id).info(
         'job {} on {} {} save pipeline successfully'.format(
             job_id, role, party_id))
Exemple #14
0
def federated_coordination_on_grpc(job_id, method, host, port, endpoint, src_party_id, src_role, dest_party_id, json_body, api_version=API_VERSION,
                                   overall_timeout=DEFAULT_REMOTE_REQUEST_TIMEOUT, try_times=3):
    """
    Send a command to another party over gRPC, retrying on failure.

    NOTE: ``json_body`` is modified in place (``src_role`` and
    ``src_party_id`` are added, plus whatever ``get_node_identity`` adds when
    ``CHECK_NODES_IDENTITY`` is set).

    :param job_id: selects the audit/schedule loggers
    :param method: request method wrapped into the packet
    :param host: host of the command federation channel
    :param port: port of the command federation channel
    :param endpoint: path appended after the ``/{api_version}`` prefix
    :param src_party_id: party id of the sender
    :param src_role: role of the sender
    :param dest_party_id: party id of the receiver
    :param json_body: dict sent as request body
    :param api_version: version prefix put in front of ``endpoint``
    :param overall_timeout: timeout; divided by 1000 before it is passed to
        the gRPC call (presumably milliseconds -> seconds, TODO confirm)
    :param try_times: maximum number of attempts
    :return: the decoded JSON body of the first successful response
    :raises Exception: when every attempt failed; the last error is attached
        as the cause
    """
    endpoint = f"/{api_version}{endpoint}"
    json_body['src_role'] = src_role
    json_body['src_party_id'] = src_party_id
    if CHECK_NODES_IDENTITY:
        get_node_identity(json_body, src_party_id)
    _packet = wrap_grpc_packet(json_body, method, endpoint, src_party_id, dest_party_id, job_id,
                               overall_timeout=overall_timeout)
    _routing_metadata = gen_routing_metadata(src_party_id=src_party_id, dest_party_id=dest_party_id)
    exception = None
    for t in range(try_times):
        channel = None
        try:
            channel, stub = get_command_federation_channel(host, port)
            _return, _call = stub.unaryCall.with_call(_packet, metadata=_routing_metadata, timeout=(overall_timeout/1000))
            audit_logger(job_id).info("grpc api response: {}".format(_return))
            return json_loads(_return.body.value)
        except Exception as e:
            exception = e
            schedule_logger(job_id).warning(f"remote request {endpoint} error, sleep and try again")
        finally:
            # Close the channel on failure too, so failed attempts do not
            # leak connections.
            if channel is not None:
                channel.close()
        # Back off a little longer after each failure; there is no point in
        # sleeping when no further attempt follows.
        if t < try_times - 1:
            time.sleep(2 * (t+1))
    tips = 'Please check rollSite and fateflow network connectivity'
    # More specific hints that were once derived from the error text:
    #   'Error received from peer' -> check that the fate flow server of the
    #       other party is started
    #   'failed to connect to all addresses' -> check that the rollsite
    #       service (port: 9370) is started
    raise Exception('{}. rpc request error: {}'.format(tips, exception)) from exception
Exemple #15
0
 def schedule_ready_job(cls, job):
     """
     Reset the ready signal of a job and log the outcome.

     :param job: job record; only ``f_job_id`` is used
     :return: None
     """
     job_id = job.f_job_id
     update_status = cls.ready_signal(job_id=job_id,
                                      set_or_reset=False,
                                      ready_timeout_ttl=60 * 1000)
     schedule_logger(job_id).info(
         f"reset job {job_id} ready signal {update_status}")
Exemple #16
0
 def schedule_ready_jobs(cls, event):
     """
     Put a queued job event back to the waiting status and log the outcome.

     :param event: queue event providing ``f_job_id``, ``f_initiator_role``
         and ``f_initiator_party_id``
     :return: None
     """
     job_id = event.f_job_id
     initiator_role = event.f_initiator_role
     initiator_party_id = event.f_initiator_party_id
     update_status = JobQueue.update_event(
         job_id=job_id,
         initiator_role=initiator_role,
         initiator_party_id=initiator_party_id,
         job_status=JobStatus.WAITING,
         ttl=5*60*1000)
     schedule_logger(job_id).info(f"update job {job_id} ready status to waiting {update_status}")
Exemple #17
0
 def collect_task_of_all_party(cls, job, task):
     """
     Collect the task from every party and store the data each one returned.

     When the collection as a whole is not successful a warning is logged and
     nothing is stored.

     :param job: job the task belongs to
     :param task: task to collect
     :return: None
     """
     status, federated_response = FederatedScheduler.collect_task(job=job, task=task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job_id=job.f_job_id).warning(f"collect task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} failed")
         return
     # The response is nested as role -> party id -> party response.
     for role_responses in federated_response.values():
         for party_response in role_responses.values():
             collected_task_info = party_response["data"]
             JobSaver.update_task_status(task_info=collected_task_info)
             JobSaver.update_task(task_info=collected_task_info)
Exemple #18
0
def check_job_is_timeout(job: Job):
    """
    Tell whether a job has been alive longer than its timeout.

    The timeout is the ``timeout`` job parameter of the party's runtime conf,
    falling back to ``JOB_DEFAULT_TIMEOUT``. The elapsed time is measured from
    ``job.f_create_time`` and divided by 1000 before the comparison.

    :param job: job record to check
    :return: True when the elapsed time exceeds the timeout, else False
    """
    timeout = job.f_runtime_conf_on_party["job_parameters"].get(
        "timeout", JOB_DEFAULT_TIMEOUT)
    running_time = (current_timestamp() - job.f_create_time)/1000
    if not running_time > timeout:
        return False
    schedule_logger(job_id=job.f_job_id).info('job {}  run time {}s timeout'.format(job.f_job_id, running_time))
    return True
Exemple #19
0
 def get_remaining_resource(cls, resource_model: typing.Union[EngineRegistry, Job], filters):
     """
     Read the remaining cores and memory of the first row matching ``filters``.

     Query errors are logged and reported as ``(None, None)``. The result is
     returned after the ``try`` block rather than from a ``finally`` clause,
     so ``KeyboardInterrupt``/``SystemExit`` are no longer silently discarded.

     :param resource_model: model class to query
     :param filters: conditions passed to ``where``
     :return: ``(remaining_cores, remaining_memory)``; both None when no row
         matches or the query failed
     """
     remaining_cores, remaining_memory = None, None
     try:
         objs = resource_model.select(resource_model.f_remaining_cores, resource_model.f_remaining_memory).where(
             *filters)
         if objs:
             remaining_cores, remaining_memory = objs[0].f_remaining_cores, objs[0].f_remaining_memory
     except Exception as e:
         schedule_logger().exception(e)
     return remaining_cores, remaining_memory
Exemple #20
0
 def job_command(cls,
                 job,
                 command,
                 command_body=None,
                 dest_only_initiator=False,
                 specific_dest=None,
                 order_federated=False):
     """
     Send one job command to a set of parties and gather their responses.

     Destinations are, in order of precedence: only the initiator
     (``dest_only_initiator``), the parties in ``specific_dest``, or every
     party in ``job.f_roles``. A party whose request raises gets a synthetic
     response with ``RetCode.FEDERATED_ERROR``.

     :param job: job the command is about
     :param command: last path segment of the endpoint
     :param command_body: request body; an empty dict is sent when falsy
     :param dest_only_initiator: send to the initiator only, using the
         ``initiator`` API instead of the ``party`` API
     :param specific_dest: mapping of role to party ids to send to
     :param order_federated: reorder the destinations through
         ``schedule_utils.federated_order_reset``
     :return: whatever ``cls.return_federated_response`` builds from the
         role -> party id -> response mapping
     """
     job_parameters = job.f_runtime_conf_on_party["job_parameters"]
     api_type = "initiator" if dest_only_initiator else "party"
     if dest_only_initiator:
         dest_partys = [(job.f_initiator_role, [job.f_initiator_party_id])]
     elif specific_dest:
         dest_partys = specific_dest.items()
     else:
         dest_partys = job.f_roles.items()
     if order_federated:
         dest_partys = schedule_utils.federated_order_reset(
             dest_partys,
             scheduler_partys_info=[(job.f_initiator_role,
                                     job.f_initiator_party_id)])

     federated_response = {}
     for dest_role, dest_party_ids in dest_partys:
         role_responses = federated_response[dest_role] = {}
         for dest_party_id in dest_party_ids:
             try:
                 party_response = federated_api(
                     job_id=job.f_job_id,
                     method='POST',
                     endpoint='/{}/{}/{}/{}/{}'.format(
                         api_type, job.f_job_id, dest_role, dest_party_id,
                         command),
                     src_party_id=job.f_initiator_party_id,
                     dest_party_id=dest_party_id,
                     src_role=job.f_initiator_role,
                     json_body=command_body or {},
                     federated_mode=job_parameters["federated_mode"])
             except Exception as e:
                 schedule_logger(job_id=job.f_job_id).exception(e)
                 party_response = {
                     "retcode": RetCode.FEDERATED_ERROR,
                     "retmsg": "Federated schedule error, {}".format(e)
                 }
             role_responses[dest_party_id] = party_response
             # Any truthy retcode is treated as a failure.
             if party_response["retcode"]:
                 schedule_logger(job_id=job.f_job_id).warning(
                     "an error occurred while {} the job to role {} party {}: \n{}"
                     .format(command, dest_role, dest_party_id,
                             party_response["retmsg"]))
     return cls.return_federated_response(
         federated_response=federated_response)
Exemple #21
0
 def federated_task_status(cls, job_id, task_id, task_version):
     """
     Derive the overall status of a task from the party status of each party.

     :param job_id: used for logging only
     :param task_id: id of the task to look up
     :param task_version: version of the task to look up
     :return: the status computed by ``cls.calculate_multi_party_task_status``
     """
     party_status_list = [
         party_task.f_party_status
         for party_task in JobSaver.query_task(task_id=task_id,
                                               task_version=task_version)
     ]
     overall_status = cls.calculate_multi_party_task_status(party_status_list)
     schedule_logger(job_id=job_id).info(
         "job {} task {} {} status is {}, calculate by task party status list: {}"
         .format(job_id, task_id, task_version, overall_status,
                 party_status_list))
     return overall_status
Exemple #22
0
 def save_metric_meta(self,
                      metric_namespace: str,
                      metric_name: str,
                      metric_meta: MetricMeta,
                      job_level: bool = False):
     """
     Store the meta information of a metric.

     The items of ``metric_meta.to_dict()`` are inserted through
     ``insert_metrics_into_db`` with data type ``0``.

     :param metric_namespace: namespace of the metric
     :param metric_name: name of the metric
     :param metric_meta: meta object to store
     :param job_level: forwarded to ``insert_metrics_into_db``
     :return: None
     """
     schedule_logger(self.job_id).info(
         'save job {} component {} on {} {} {} {} metric meta'.format(
             self.job_id, self.component_name, self.role, self.party_id,
             metric_namespace, metric_name))
     meta_items = metric_meta.to_dict().items()
     self.insert_metrics_into_db(metric_namespace, metric_name, 0,
                                 meta_items, job_level)
Exemple #23
0
 def finish(cls, job, end_status):
     """
     Wrap up a finished job: stop it on the initiator, then clean it up.

     :param job: job record that has finished
     :param end_status: final status; passed to ``cls.stop_job`` as the stop
         status and written to the log
     :return: None
     """
     job_id = job.f_job_id
     schedule_logger(job_id=job_id).info(
         "Job {} finished with {}, do something...".format(
             job_id, end_status))
     cls.stop_job(job_id=job_id,
                  role=job.f_initiator_role,
                  party_id=job.f_initiator_party_id,
                  stop_status=end_status)
     FederatedScheduler.clean_job(job=job)
     schedule_logger(job_id=job_id).info(
         "Job {} finished with {}, done".format(job_id, end_status))
Exemple #24
0
    def save_pipelined_model(cls, job_id, role, party_id):
        """
        Build the pipeline description of a job and store it with the model.

        Nothing is saved when ``role`` is listed in the ``assistant_role`` job
        parameter or when the ``job_type`` parameter is ``'predict'``. For any
        role other than ``'local'`` the machine learning model info is saved
        as well.

        :param job_id: job whose configuration is loaded
        :param role: role of the party; also selects the predict DSL
        :param party_id: id of the party
        :return: None
        """
        schedule_logger(job_id).info(
            'job {} on {} {} start to save pipeline'.format(
                job_id, role, party_id))
        dsl, runtime_conf, party_conf, train_conf = job_utils.get_job_configuration(
            job_id=job_id, role=role, party_id=party_id)
        parameters = party_conf.get('job_parameters', {})
        if role in parameters.get("assistant_role", []):
            return

        # All required keys are read before the predict check below.
        model_id = parameters['model_id']
        model_version = parameters['model_version']
        job_type = parameters.get('job_type', '')
        work_mode = parameters['work_mode']
        roles = party_conf['role']
        initiator_role = party_conf['initiator']['role']
        initiator_party_id = party_conf['initiator']['party_id']
        if job_type == 'predict':
            return

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=dsl,
            runtime_conf=runtime_conf,
            train_runtime_conf=train_conf)
        predict_dsl = dsl_parser.get_predict_dsl(role=role)

        pipeline = pipeline_pb2.Pipeline()
        # DSLs and configurations
        pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
        pipeline.train_dsl = json_dumps(dsl, byte=True)
        pipeline.train_runtime_conf = json_dumps(runtime_conf, byte=True)
        pipeline.runtime_conf_on_party = json_dumps(party_conf, byte=True)
        # Model identity
        pipeline.fate_version = RuntimeConfig.get_env("FATE")
        pipeline.model_id = model_id
        pipeline.model_version = model_version
        # Parties taking part
        pipeline.roles = json_dumps(roles, byte=True)
        pipeline.work_mode = work_mode
        pipeline.initiator_role = initiator_role
        pipeline.initiator_party_id = initiator_party_id
        # Bookkeeping defaults
        pipeline.parent = True
        pipeline.loaded_times = 0
        pipeline.parent_info = json_dumps({}, byte=True)

        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          model_id=model_id,
                          model_version=model_version)
        tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
        if role != 'local':
            tracker.save_machine_learning_model_info()
        schedule_logger(job_id).info(
            'job {} on {} {} save pipeline successfully'.format(
                job_id, role, party_id))
Exemple #25
0
 def report_task_update_to_driver(cls, task_info):
     """
     Report task update to FATEFlow Server

     :param task_info: mapping providing at least ``task_id``,
         ``task_version``, ``role`` and ``party_id``; handed unchanged to
         ``ControllerClient.report_task``
     :return: None
     """
     logged_fields = ("task_id", "task_version", "role", "party_id")
     schedule_logger().info("report task {} {} {} {} to driver".format(
         *[task_info[field] for field in logged_fields]))
     ControllerClient.report_task(task_info=task_info)
Exemple #26
0
 def get_table_meta(self, table_info):
     """
     Load the meta of a storage table as a dict.

     The ``part_of_data`` and ``schema`` entries are replaced by their
     ``serialize_b64(..., to_str=True)`` form; the success log line is written
     before that replacement.

     :param table_info: mapping with the keys ``namespace`` and
         ``table_name``
     :return: the table meta dict
     """
     schedule_logger(self.job_id).info(f'start get table meta:{table_info}')
     table_meta = storage.StorageTableMeta(
         namespace=table_info.get("namespace"),
         name=table_info.get("table_name"),
         create_address=False)
     meta_dict = table_meta.to_dict()
     schedule_logger(
         self.job_id).info(f'get table meta success: {meta_dict}')
     for field in ("part_of_data", "schema"):
         meta_dict[field] = serialize_b64(meta_dict[field], to_str=True)
     return meta_dict
Exemple #27
0
 def save_metric_data(self,
                      metric_namespace: str,
                      metric_name: str,
                      metrics: List[Metric],
                      job_level=False):
     """
     Store the data points of a metric.

     Each metric contributes one ``(key, value)`` pair, inserted through
     ``insert_metrics_into_db`` with data type ``1``.

     :param metric_namespace: namespace of the metric
     :param metric_name: name of the metric
     :param metrics: metric objects providing ``key`` and ``value``
     :param job_level: forwarded to ``insert_metrics_into_db``
     :return: None
     """
     schedule_logger(self.job_id).info(
         'save job {} component {} on {} {} {} {} metric data'.format(
             self.job_id, self.component_name, self.role, self.party_id,
             metric_namespace, metric_name))
     key_value_pairs = [(metric.key, metric.value) for metric in metrics]
     self.insert_metrics_into_db(metric_namespace, metric_name, 1,
                                 key_value_pairs, job_level)
Exemple #28
0
 def bulk_insert_into_db(self, model, data_source):
     """
     Insert rows in batches, each batch inside its own ``DB.atomic()`` block.

     The table is created first; a failure to create it is logged and the
     insert is attempted anyway. The batch size is 50 when
     ``RuntimeConfig.USE_LOCAL_DATABASE`` is set and 1000 otherwise.

     :param model: model class to insert into
     :param data_source: sliceable sequence of row dicts
     :return: the number of rows in ``data_source`` on success, 0 when an
         error occurred (batches inserted before the error are not undone
         here)
     """
     try:
         try:
             DB.create_tables([model])
         except Exception as e:
             schedule_logger(self.job_id).exception(e)

         if RuntimeConfig.USE_LOCAL_DATABASE:
             batch_size = 50
         else:
             batch_size = 1000

         total = len(data_source)
         for start in range(0, total, batch_size):
             with DB.atomic():
                 batch = data_source[start:start + batch_size]
                 model.insert_many(batch).execute()
         return total
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         return 0
Exemple #29
0
def stop_job():
    """
    Handle a stop-job request: stop the job locally, then on all parties.

    ``job_id`` and ``stop_status`` (default ``"canceled"``) are read from the
    JSON body of the current request.

    :return: a JSON result with ``RetCode.DATA_ERROR`` when the job is not
        found, ``RetCode.SUCCESS`` when the federated stop request succeeded
        and ``RetCode.OPERATING_ERROR`` otherwise
    """
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if not jobs:
        schedule_logger(job_id).info(f"can not found job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")

    job = jobs[0]
    schedule_logger(job_id).info(f"stop job on this party")
    kill_status, kill_details = JobController.stop_jobs(
        job_id=job_id, stop_status=stop_status)
    schedule_logger(job_id).info(
        f"stop job on this party status {kill_status}")
    schedule_logger(job_id).info(
        f"request stop job {job} to {stop_status}")
    status_code, response = FederatedScheduler.request_stop_job(
        job=job,
        stop_status=stop_status,
        command_body=job.to_json())

    if status_code != FederatedSchedulingStatusCode.SUCCESS:
        return get_json_result(retcode=RetCode.OPERATING_ERROR,
                               retmsg="stop job on this party {};\n"
                               "stop job failed:\n{}".format(
                                   kill_status,
                                   json_dumps(response, indent=4)))
    return get_json_result(
        retcode=RetCode.SUCCESS,
        retmsg=f"stop job on this party {kill_status};\n"
        f"stop job on all party success")
Exemple #30
0
 def update_task(cls, task_info):
     """
     Save to local database and then report to Initiator

     Errors raised by either step are logged and not propagated. The result
     is returned after the ``try`` block rather than from a ``finally``
     clause, so ``KeyboardInterrupt``/``SystemExit`` are no longer silently
     discarded.

     :param task_info: mapping describing the task; ``job_id`` selects the
         logger
     :return: the result of ``JobSaver.update_task``, or False when saving
         failed
     """
     update_status = False
     try:
         update_status = JobSaver.update_task(task_info=task_info)
         cls.report_task_to_initiator(task_info=task_info)
     except Exception as e:
         schedule_logger(job_id=task_info["job_id"]).exception(e)
     return update_status