Example #1
 def upload_spark_dependence(
         cls,
         job,
         upload_details,
         storage_engine=FateDependenceStorageEngine.HDFS.value):
     schedule_logger(
         job.f_job_id).info(f"start upload dependence: {upload_details}")
     for version, type_provider in upload_details.items():
         for dependence_type, provider in type_provider.items():
             storage_meta = {
                 "f_storage_engine": storage_engine,
                 "f_type": dependence_type,
                 "f_version": version,
                 "f_upload_status": True
             }
             schedule_logger(job.f_job_id).info(
                 f"update dependence storage meta:{storage_meta}")
             DependenceRegistry.save_dependencies_storage_meta(
                 storage_meta, status_check=True)
             WorkerManager.start_general_worker(
                 worker_name=WorkerName.DEPENDENCE_UPLOAD,
                 job_id=job.f_job_id,
                 role=job.f_role,
                 party_id=job.f_party_id,
                 provider=provider,
                 dependence_type=dependence_type,
                 callback=cls.record_upload_process,
                 callback_param=["dependence_type", "pid", "provider"])
Example #2
 def insert_metrics_into_db(self,
                            metric_namespace: str,
                            metric_name: str,
                            data_type: int,
                            kv,
                            job_level=False):
     try:
         model_class = self.get_model_class()
         tracking_metric = model_class()
         tracking_metric.f_job_id = self.job_id
         tracking_metric.f_component_name = (
             self.component_name
             if not job_level else job_utils.job_pipeline_component_name())
         tracking_metric.f_task_id = self.task_id
         tracking_metric.f_task_version = self.task_version
         tracking_metric.f_role = self.role
         tracking_metric.f_party_id = self.party_id
         tracking_metric.f_metric_namespace = metric_namespace
         tracking_metric.f_metric_name = metric_name
         tracking_metric.f_type = data_type
         default_db_source = tracking_metric.to_dict()
         tracking_metric_data_source = []
         for k, v in kv:
             db_source = default_db_source.copy()
             db_source['f_key'] = serialize_b64(k)
             db_source['f_value'] = serialize_b64(v)
             db_source['f_create_time'] = current_timestamp()
             tracking_metric_data_source.append(db_source)
         db_utils.bulk_insert_into_db(model_class,
                                      tracking_metric_data_source,
                                      schedule_logger(self.job_id))
     except Exception as e:
          schedule_logger(self.job_id).exception(
              "Failed to insert metric {} of metric namespace {} into database:\n{}"
              .format(metric_name, metric_namespace, e))
Example #3
def is_task_executor_process(task: Task, process: psutil.Process):
    """
    check the process if task executor or not by command
    :param task:
    :param process:
    :return:
    """
    try:
        cmdline = process.cmdline()
    except Exception as e:
        # The command line could not be read, so we cannot tell whether this is a task executor
        # process; log a warning and treat it as not matching.
        schedule_logger(task.f_job_id).warning(e)
        return False
    else:
        schedule_logger(task.f_job_id).info(cmdline)

    if task.f_worker_id and task.f_worker_id in cmdline:
        return True

    if len(cmdline) != len(task.f_cmd):
        return False

    for i, v in enumerate(task.f_cmd):
        if cmdline[i] != str(v):
            return False

    return True
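
A minimal usage sketch (not from the FATE Flow source): scan the process table for the executor of a given task. Only the psutil calls are standard psutil API; the find_task_executor_process helper and its behaviour are assumptions for illustration.

import psutil

def find_task_executor_process(task):
    # Hypothetical helper: return the first live process whose command line matches the task,
    # or None when no such process exists. is_task_executor_process already returns False for
    # processes whose command line cannot be read (e.g. they exited mid-scan).
    for proc in psutil.process_iter():
        if is_task_executor_process(task, proc):
            return proc
    return None
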
Example #4
 def report_task_to_initiator(cls, task: Task):
     """
     :param task:
     :return:
     """
     if task.f_role != task.f_initiator_role and task.f_party_id != task.f_initiator_party_id:
         try:
             response = federated_api(
                 job_id=task.f_job_id,
                 method='POST',
                 endpoint='/initiator/{}/{}/{}/{}/{}/{}/report'.format(
                     task.f_job_id, task.f_component_name, task.f_task_id,
                     task.f_task_version, task.f_role, task.f_party_id),
                 src_party_id=task.f_party_id,
                 dest_party_id=task.f_initiator_party_id,
                 src_role=task.f_role,
                 json_body=task.to_human_model_dict(
                     only_primary_with=cls.REPORT_TO_INITIATOR_FIELDS),
                 federated_mode=task.f_federated_mode)
         except Exception as e:
             schedule_logger(task.f_job_id).error(
                 f"report task to initiator error: {e}")
             return False
         if response["retcode"] != RetCode.SUCCESS:
             retmsg = response["retmsg"]
             schedule_logger(task.f_job_id).error(
                 f"report task to initiator error: {retmsg}")
             return False
         else:
             return True
     else:
         return False
Example #5
def start_session_stop(task):
    job_parameters = RunParameters(**get_job_parameters(
        job_id=task.f_job_id, role=task.f_role, party_id=task.f_party_id))
    session_manager_id = generate_session_id(task.f_task_id,
                                             task.f_task_version, task.f_role,
                                             task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(
            f'start run subprocess to stop task sessions {session_manager_id}')
    else:
        schedule_logger(task.f_job_id).info(
            f'task is waiting, pass stop sessions {session_manager_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id),
                            task.f_role, task.f_party_id,
                            task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        sys.executable or 'python3',
        sys.modules[session_utils.SessionStop.__module__].__file__,
        '--session', session_manager_id, '--computing',
        job_parameters.computing_engine, '--federation',
        job_parameters.federation_engine, '--storage',
        job_parameters.storage_engine, '-c',
        'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = process_utils.run_subprocess(job_id=task.f_job_id,
                                     config_dir=task_dir,
                                     process_cmd=process_cmd)
    p.wait()
    p.poll()
Example #6
 def kill_task_all_workers(cls, task: Task):
     schedule_logger(task.f_job_id).info(
         start_log("kill all workers", task=task))
     workers_info = WorkerInfo.query(task_id=task.f_task_id,
                                     task_version=task.f_task_version,
                                     role=task.f_role,
                                     party_id=task.f_party_id)
     for worker_info in workers_info:
         schedule_logger(task.f_job_id).info(
             start_log(
                 f"kill {worker_info.f_worker_name}({worker_info.f_run_pid})",
                 task=task))
         try:
             cls.kill_worker(worker_info)
             schedule_logger(task.f_job_id).info(
                 successful_log(
                     f"kill {worker_info.f_worker_name}({worker_info.f_run_pid})",
                     task=task))
         except Exception as e:
             schedule_logger(task.f_job_id).warning(
                 failed_log(
                     f"kill {worker_info.f_worker_name}({worker_info.f_run_pid})",
                     task=task),
                 exc_info=True)
     schedule_logger(task.f_job_id).info(
         successful_log("kill all workers", task=task))
Example #7
 def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
     tasks_on_all_party = JobSaver.query_task(
         task_id=initiator_task.f_task_id,
         task_version=initiator_task.f_task_version)
     tasks_status_on_all = {task.f_status for task in tasks_on_all_party}
     if len(tasks_status_on_all) <= 1 and TaskStatus.RUNNING not in tasks_status_on_all:
         return
     status, federated_response = FederatedScheduler.collect_task(
         job=job, task=initiator_task)
     if status != FederatedSchedulingStatusCode.SUCCESS:
         schedule_logger(job.f_job_id).warning(
             f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} on {initiator_task.f_role} {initiator_task.f_party_id} failed"
         )
     for _role in federated_response.keys():
         for _party_id, party_response in federated_response[_role].items():
             if party_response["retcode"] == RetCode.SUCCESS:
                 JobSaver.update_task_status(
                     task_info=party_response["data"])
                 JobSaver.update_task(task_info=party_response["data"])
             elif party_response[
                     "retcode"] == RetCode.FEDERATED_ERROR and set_status:
                 tmp_task_info = {
                     "job_id": initiator_task.f_job_id,
                     "task_id": initiator_task.f_task_id,
                     "task_version": initiator_task.f_task_version,
                     "role": _role,
                     "party_id": _party_id,
                     "party_status": TaskStatus.RUNNING
                 }
                 JobSaver.update_task_status(task_info=tmp_task_info)
                 tmp_task_info["party_status"] = set_status
                 JobSaver.update_task_status(task_info=tmp_task_info)
Example #8
 def read_metrics_from_db(self,
                          metric_namespace: str,
                          metric_name: str,
                          data_type,
                          job_level=False):
     metrics = []
     try:
         tracking_metric_model = self.get_model_class()
         tracking_metrics = tracking_metric_model.select(
             tracking_metric_model.f_key,
             tracking_metric_model.f_value).where(
                 tracking_metric_model.f_job_id == self.job_id,
                 tracking_metric_model.f_component_name == (
                     self.component_name if not job_level else
                     job_utils.job_pipeline_component_name()),
                 tracking_metric_model.f_role == self.role,
                 tracking_metric_model.f_party_id == self.party_id,
                 tracking_metric_model.f_metric_namespace ==
                 metric_namespace,
                 tracking_metric_model.f_metric_name == metric_name,
                 tracking_metric_model.f_type == data_type)
         for tracking_metric in tracking_metrics:
             yield deserialize_b64(tracking_metric.f_key), deserialize_b64(
                 tracking_metric.f_value)
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         raise e
     return metrics
Example #9
 def insert_summary_into_db(self, summary_data: dict, need_serialize=True):
     try:
         summary_model = self.get_dynamic_db_model(ComponentSummary, self.job_id)
         DB.create_tables([summary_model])
         summary_obj = summary_model.get_or_none(
             summary_model.f_job_id == self.job_id,
             summary_model.f_component_name == self.component_name,
             summary_model.f_role == self.role,
             summary_model.f_party_id == self.party_id,
             summary_model.f_task_id == self.task_id,
             summary_model.f_task_version == self.task_version
         )
         if summary_obj:
             summary_obj.f_summary = serialize_b64(summary_data, to_str=True) if need_serialize else summary_data
             summary_obj.f_update_time = current_timestamp()
             summary_obj.save()
         else:
             self.get_dynamic_db_model(ComponentSummary, self.job_id).create(
                 f_job_id=self.job_id,
                 f_component_name=self.component_name,
                 f_role=self.role,
                 f_party_id=self.party_id,
                 f_task_id=self.task_id,
                 f_task_version=self.task_version,
                 f_summary=serialize_b64(summary_data, to_str=True),
                 f_create_time=current_timestamp()
             )
     except Exception as e:
         schedule_logger(self.job_id).exception("An exception where querying summary job id: {} "
                                                "component name: {} to database:\n{}".format(
             self.job_id, self.component_name, e)
         )
Example #10
 def get_table_meta(self, table_info):
     schedule_logger(self.job_id).info(f'start get table meta:{table_info}')
     table_meta_dict = storage.StorageTableMeta(namespace=table_info.get("namespace"), name=table_info.get("table_name"), create_address=False).to_dict()
     schedule_logger(self.job_id).info(f'get table meta success: {table_meta_dict}')
     table_meta_dict["part_of_data"] = serialize_b64(table_meta_dict["part_of_data"], to_str=True)
     table_meta_dict["schema"] = serialize_b64(table_meta_dict["schema"], to_str=True)
     return table_meta_dict
Example #11
 def save_metric_data(self, metric_namespace: str, metric_name: str, metrics: typing.List[Metric], job_level=False):
     schedule_logger(self.job_id).info(
         'save component {} on {} {} {} {} metric data'.format(self.component_name, self.role,
                                                               self.party_id, metric_namespace, metric_name))
     kv = []
     for metric in metrics:
         kv.append((metric.key, metric.value))
     self.metric_manager.insert_metrics_into_db(metric_namespace, metric_name, 1, kv, job_level)
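
A minimal call sketch (hypothetical, not from the source). save_metric_data only reads .key and .value from each metric, so a namedtuple stand-in with those attributes is enough here; `tracker` is assumed to be the tracking object these methods belong to.

from collections import namedtuple

# Stand-in exposing the two attributes save_metric_data reads above.
Metric = namedtuple("Metric", ["key", "value"])

loss_per_epoch = [(0, 0.69), (1, 0.52), (2, 0.41)]
metrics = [Metric(key=epoch, value=loss) for epoch, loss in loss_per_epoch]
tracker.save_metric_data(metric_namespace="train", metric_name="loss", metrics=metrics, job_level=False)
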
Example #12
    def run(self, task: Task, run_parameters, run_parameters_path, config_dir,
            log_dir, cwd_dir, **kwargs):
        spark_home = ServiceRegistry.FATE_ON_SPARK.get("spark", {}).get("home")
        if not spark_home:
            try:
                import pyspark
                spark_home = pyspark.__path__[0]
            except Exception:
                raise RuntimeError("can not import pyspark")
        # else:
        #     raise ValueError(f"spark home must be configured in conf/service_conf.yaml when run on cluster mode")

        # additional configs
        spark_submit_config = run_parameters.spark_run

        deploy_mode = spark_submit_config.get("deploy-mode", "client")
        if deploy_mode not in ["client"]:
            raise ValueError(f"deploy mode {deploy_mode} not supported")

        spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
        executable = [
            spark_submit_cmd, f"--name={task.f_task_id}#{task.f_role}"
        ]
        for k, v in spark_submit_config.items():
            if k != "conf":
                executable.append(f"--{k}={v}")
        if "conf" in spark_submit_config:
            for ck, cv in spark_submit_config["conf"].items():
                executable.append(f"--conf")
                executable.append(f"{ck}={cv}")
        extra_env = {}
        extra_env["SPARK_HOME"] = spark_home
        if DEPENDENT_DISTRIBUTION:
            dependence = Dependence()
            dependence.init(provider=ComponentProvider(**task.f_provider_info))
            executor_env_pythonpath, executor_python_env, driver_python_env, archives = \
                dependence.get_task_dependence_info()
            schedule_logger(task.f_job_id).info(
                f"executor_env_python {executor_python_env},"
                f"driver_env_python {driver_python_env}, archives {archives}")
            executable.append(f'--archives')
            executable.append(archives)
            executable.append(f'--conf')
            executable.append(f'spark.pyspark.python={executor_python_env}')
            executable.append(f'--conf')
            executable.append(
                f'spark.executorEnv.PYTHONPATH={executor_env_pythonpath}')
            executable.append(f'--conf')
            executable.append(
                f'spark.pyspark.driver.python={driver_python_env}')
        return WorkerManager.start_task_worker(
            worker_name=WorkerName.TASK_EXECUTOR,
            task=task,
            task_parameters=run_parameters,
            executable=executable,
            extra_env=extra_env)
Example #13
def federated_coordination_on_grpc(job_id,
                                   method,
                                   host,
                                   port,
                                   endpoint,
                                   src_party_id,
                                   src_role,
                                   dest_party_id,
                                   json_body,
                                   api_version=API_VERSION,
                                   overall_timeout=None,
                                   try_times=3):
    overall_timeout = JobDefaultConfig.remote_request_timeout if overall_timeout is None else overall_timeout
    endpoint = f"/{api_version}{endpoint}"

    json_body['src_fate_ver'] = RuntimeConfig.get_env('FATE')
    json_body['src_role'] = src_role
    json_body['src_party_id'] = src_party_id

    if CHECK_NODES_IDENTITY:
        get_node_identity(json_body, src_party_id)
    _packet = wrap_grpc_packet(json_body,
                               method,
                               endpoint,
                               src_party_id,
                               dest_party_id,
                               job_id,
                               overall_timeout=overall_timeout)
    _routing_metadata = gen_routing_metadata(src_party_id=src_party_id,
                                             dest_party_id=dest_party_id)
    exception = None
    for t in range(try_times):
        try:
            channel, stub = get_command_federation_channel(host, port)
            _return, _call = stub.unaryCall.with_call(
                _packet,
                metadata=_routing_metadata,
                timeout=(overall_timeout / 1000))
            audit_logger(job_id).info("grpc api response: {}".format(_return))
            channel.close()
            response = json_loads(_return.body.value)
            return response
        except Exception as e:
            exception = e
            schedule_logger(job_id).warning(
                f"remote request {endpoint} error, sleep and try again")
            time.sleep(2 * (t + 1))
    else:
        tips = 'Please check rollSite and fateflow network connectivity'
        """
        if 'Error received from peer' in str(exception):
            tips = 'Please check if the fate flow server of the other party is started. '
        if 'failed to connect to all addresses' in str(exception):
            tips = 'Please check whether the rollsite service(port: 9370) is started. '
        """
        raise Exception('{}rpc request error: {}'.format(tips, exception))
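
The for/else retry pattern above is easy to misread: a successful attempt returns from the function inside the loop, so the else branch only runs after every attempt has raised. A stripped-down sketch of the same idiom, with hypothetical names:

import time

def call_with_retry(do_call, try_times=3):
    last_exception = None
    for t in range(try_times):
        try:
            return do_call()           # success: leave immediately; the else below never runs
        except Exception as e:
            last_exception = e
            time.sleep(2 * (t + 1))    # same linear backoff as above
    else:
        # reached only when all try_times attempts raised
        raise Exception(f"rpc request error: {last_exception}")
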
Example #14
 def finish(cls, job, end_status):
     schedule_logger(job.f_job_id).info(
         f"job finished with {end_status}, do something...")
     cls.stop_job(job_id=job.f_job_id,
                  role=job.f_initiator_role,
                  party_id=job.f_initiator_party_id,
                  stop_status=end_status)
     FederatedScheduler.clean_job(job=job)
     schedule_logger(
         job.f_job_id).info(f"job finished with {end_status}, done")
Example #15
def check_job_is_timeout(job: Job):
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    timeout = job_parameters.get("timeout", JobDefaultConfig.job_timeout)
    now_time = current_timestamp()
    running_time = (now_time - job.f_create_time) / 1000
    if running_time > timeout:
        schedule_logger(job.f_job_id).info(f'run time {running_time}s timeout')
        return True
    else:
        return False
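
A small worked example of the unit handling above (made-up numbers): f_create_time and current_timestamp() are millisecond epoch timestamps, while timeout is expressed in seconds, hence the division by 1000.

f_create_time = 1_700_000_000_000                  # job created, in ms
now_time = f_create_time + 90_000_000              # 90,000,000 ms = 25 hours later
running_time = (now_time - f_create_time) / 1000   # -> 90000.0 seconds
timeout = 86400                                    # e.g. a one-day timeout, in seconds
print(running_time > timeout)                      # True: the job would be flagged as timed out
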
Example #16
 def save_table_meta(self, meta):
     schedule_logger(self.job_id).info(f'start save table meta:{meta}')
     address = storage.StorageTableMeta.create_address(storage_engine=meta.get("engine"),
                                                       address_dict=meta.get("address"))
     table_meta = storage.StorageTableMeta(name=meta.get("name"), namespace=meta.get("namespace"), new=True)
     table_meta.set_metas(**meta)
     meta["address"] = address
     meta["part_of_data"] = deserialize_b64(meta["part_of_data"])
     meta["schema"] = deserialize_b64(meta["schema"])
     table_meta.create()
     schedule_logger(self.job_id).info(f'save table meta success')
Example #17
 def tracking_output_cache(self, cache: DataCache, cache_name: str) -> str:
     cache_key = CacheManager.record(cache=cache,
                                     job_id=self.job_id,
                                     role=self.role,
                                     party_id=self.party_id,
                                     component_name=self.component_name,
                                     task_id=self.task_id,
                                     task_version=self.task_version,
                                     cache_name=cache_name)
     schedule_logger(self.job_id).info(f"tracking {self.task_id} {self.task_version} output cache, cache key is {cache_key}")
     return cache_key
Example #18
 def job_reload(cls, job):
     schedule_logger(job.f_job_id).info(f"start job reload")
     cls.log_reload(job)
     source_inheritance_tasks, target_inheritance_tasks = cls.load_source_target_tasks(
         job)
     schedule_logger(job.f_job_id).info(
         f"source_inheritance_tasks:{source_inheritance_tasks}, target_inheritance_tasks:{target_inheritance_tasks}"
     )
     cls.output_reload(job, source_inheritance_tasks,
                       target_inheritance_tasks)
     cls.status_reload(job, source_inheritance_tasks,
                       target_inheritance_tasks)
Example #19
 def read_component_metrics(self):
     try:
         tracking_metric_model = self.get_model_class()
         tracking_metrics = tracking_metric_model.select().where(
             tracking_metric_model.f_job_id == self.job_id,
             tracking_metric_model.f_component_name == self.component_name,
             tracking_metric_model.f_role == self.role,
             tracking_metric_model.f_party_id == self.party_id,
             tracking_metric_model.f_task_version == self.task_version)
         return [tracking_metric for tracking_metric in tracking_metrics]
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         raise e
Example #20
def stop_job():
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        schedule_logger(job_id).info(f"stop job on this party")
        kill_status, kill_details = JobController.stop_jobs(
            job_id=job_id, stop_status=stop_status)
        schedule_logger(job_id).info(
            f"stop job on this party status {kill_status}")
        schedule_logger(job_id).info(f"request stop job to {stop_status}")
        status_code, response = FederatedScheduler.request_stop_job(
            job=jobs[0],
            stop_status=stop_status,
            command_body=jobs[0].to_dict())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(
                retcode=RetCode.SUCCESS,
                retmsg=f"stop job on this party {kill_status}; stop job on all party success")
        else:
            return get_json_result(
                retcode=RetCode.OPERATING_ERROR,
                retmsg=f"stop job on this party {kill_status}",
                data=response)
    else:
        schedule_logger(job_id).info(f"can not found job to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR,
                               retmsg="can not found job")
Example #21
 def load_task_tracker(cls, tasks: dict):
     tracker_dict = {}
     for key, task in tasks.items():
         schedule_logger(task.f_job_id).info(
             f"task:{task.f_job_id}, {task.f_role}, {task.f_party_id},{task.f_component_name},{task.f_task_version}"
         )
         tracker = Tracker(job_id=task.f_job_id,
                           role=task.f_role,
                           party_id=task.f_party_id,
                           component_name=task.f_component_name,
                           task_id=task.f_task_id,
                           task_version=task.f_task_version)
         tracker_dict[key] = tracker
     return tracker_dict
Example #22
 def bulk_insert_into_db(self, model, data_source):
     try:
         try:
             DB.create_tables([model])
         except Exception as e:
             schedule_logger(self.job_id).exception(e)
         batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
         for i in range(0, len(data_source), batch_size):
             with DB.atomic():
                 model.insert_many(data_source[i:i+batch_size]).execute()
         return len(data_source)
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         return 0
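
A quick illustration of the batching above (made-up numbers): with 120 rows and the local-database batch size of 50, the loop issues three insert_many calls of 50, 50 and 20 rows.

data_source = [{"f_key": i, "f_value": i * i} for i in range(120)]   # made-up rows
batch_size = 50
batches = [data_source[i:i + batch_size] for i in range(0, len(data_source), batch_size)]
print([len(b) for b in batches])   # [50, 50, 20]
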
Example #23
 def update_task(cls, task_info):
     """
     Save to local database and then report to Initiator
     :param task_info:
     :return:
     """
     update_status = False
     try:
         update_status = JobSaver.update_task(task_info=task_info)
         cls.report_task_to_initiator(task_info=task_info)
     except Exception as e:
         schedule_logger(task_info["job_id"]).exception(e)
     finally:
         return update_status
Example #24
 def get_remaining_resource(cls,
                            resource_model: typing.Union[EngineRegistry,
                                                         Job], filters):
     remaining_cores, remaining_memory = None, None
     try:
         objs = resource_model.select(
             resource_model.f_remaining_cores,
             resource_model.f_remaining_memory).where(*filters)
         if objs:
             remaining_cores = objs[0].f_remaining_cores
             remaining_memory = objs[0].f_remaining_memory
     except Exception as e:
         schedule_logger().exception(e)
     finally:
         return remaining_cores, remaining_memory
Example #25
    def status_reload(cls, job, source_tasks, target_tasks):
        schedule_logger(job.f_job_id).info("start reload status")
        # update task status
        for key, source_task in source_tasks.items():
            JobSaver.reload_task(source_task, target_tasks[key])

        # update job status
        JobSaver.update_job(
            job_info={
                "job_id": job.f_job_id,
                "role": job.f_role,
                "party_id": job.f_party_id,
                "inheritance_status": JobInheritanceStatus.SUCCESS
            })
        schedule_logger(job.f_job_id).info("reload status success")
Example #26
def federated_coordination_on_http(job_id,
                                   method,
                                   host,
                                   port,
                                   endpoint,
                                   src_party_id,
                                   src_role,
                                   dest_party_id,
                                   json_body,
                                   api_version=API_VERSION,
                                   overall_timeout=None,
                                   try_times=3):
    overall_timeout = JobDefaultConfig.remote_request_timeout if overall_timeout is None else overall_timeout
    endpoint = f"/{api_version}{endpoint}"
    exception = None

    json_body['src_fate_ver'] = RuntimeConfig.get_env('FATE')
    json_body['src_role'] = src_role
    json_body['src_party_id'] = src_party_id

    for t in range(try_times):
        try:
            url = "http://{}:{}{}".format(host, port, endpoint)
            audit_logger(job_id).info(
                'remote http api request: {}'.format(url))

            headers = HEADERS.copy()
            headers["dest-party-id"] = str(dest_party_id)
            headers["src-fate-ver"] = RuntimeConfig.get_env("FATE")
            headers["src-party-id"] = str(src_party_id)
            headers["src-role"] = str(src_role)

            response = request(method=method,
                               url=url,
                               json=json_body,
                               headers=headers)
            audit_logger(job_id).info(response.text)
            audit_logger(job_id).info('remote http api response: {} {}'.format(
                endpoint, response.json()))

            return response.json()
        except Exception as e:
            exception = e
            schedule_logger(job_id).warning(
                f"remote http request {endpoint} error, sleep and try again")
            time.sleep(2 * (t + 1))
    else:
        raise exception
Example #27
 def log_reload(cls, job):
     schedule_logger(job.f_job_id).info("start reload job log")
     if job.f_inheritance_info:
         for component_name in job.f_inheritance_info.get("component_list"):
             source_path = os.path.join(
                 log_utils.get_logger_base_dir(),
                 job.f_inheritance_info.get("job_id"), job.f_role,
                 job.f_party_id, component_name)
             target_path = os.path.join(log_utils.get_logger_base_dir(),
                                        job.f_job_id, job.f_role,
                                        job.f_party_id, component_name)
             if os.path.exists(source_path):
                 if os.path.exists(target_path):
                     shutil.rmtree(target_path)
                 shutil.copytree(source_path, target_path)
     schedule_logger(job.f_job_id).info("reload job log success")
Example #28
 def check_job_inherit_dependence(cls, job):
     schedule_logger(job.f_job_id).info(
         f"check job inherit dependence: {job.f_inheritance_info}, {job.f_inheritance_status}"
     )
     if job.f_inheritance_info:
         if job.f_inheritance_status == JobInheritanceStatus.WAITING:
             cls.start_inheriting_job(job)
             return False
         elif job.f_inheritance_status == JobInheritanceStatus.RUNNING:
             return False
         elif job.f_inheritance_status == JobInheritanceStatus.FAILED:
             raise Exception("job inheritance failed")
         else:
             return True
     else:
         return True
Example #29
 def fill_default_job_parameters(cls, job_id,
                                 job_parameters: RunParameters):
     keys = {
         "task_parallelism", "auto_retries", "auto_retry_delay",
         "federated_status_collect_type"
     }
     for key in keys:
         if hasattr(job_parameters,
                    key) and getattr(job_parameters, key) is None:
             if hasattr(JobDefaultConfig, key):
                 setattr(job_parameters, key,
                         getattr(JobDefaultConfig, key))
             else:
                 schedule_logger(job_id).warning(
                     f"cannot find default value of job parameter {key} in job_default_settings")
Example #30
 def read_summary_from_db(self, need_deserialize=True):
     try:
         summary_model = self.get_dynamic_db_model(ComponentSummary, self.job_id)
         summary = summary_model.get_or_none(
             summary_model.f_job_id == self.job_id,
             summary_model.f_component_name == self.component_name,
             summary_model.f_role == self.role,
             summary_model.f_party_id == self.party_id
         )
         if summary:
             cpn_summary = deserialize_b64(summary.f_summary) if need_deserialize else summary.f_summary
         else:
             cpn_summary = ""
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
         raise e
     return cpn_summary
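
A minimal round-trip sketch combining this method with insert_summary_into_db from Example #9 (hypothetical; `tracker` is assumed to be the tracking object these methods belong to, and the summary is expected to survive the base64 serialize/deserialize round trip):

summary = {"loss": 0.41, "iterations": 30}
tracker.insert_summary_into_db(summary_data=summary, need_serialize=True)
restored = tracker.read_summary_from_db(need_deserialize=True)
print(restored == summary)   # expected: True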