def insert_summary_into_db(self, summary_data: dict):
    try:
        summary_model = self.get_dynamic_db_model(ComponentSummary, self.job_id)
        DB.create_tables([summary_model])
        summary_obj = summary_model.get_or_none(
            summary_model.f_job_id == self.job_id,
            summary_model.f_component_name == self.component_name,
            summary_model.f_role == self.role,
            summary_model.f_party_id == self.party_id,
            summary_model.f_task_id == self.task_id,
            summary_model.f_task_version == self.task_version)
        if summary_obj:
            # Update the existing summary record in place.
            summary_obj.f_summary = serialize_b64(summary_data, to_str=True)
            summary_obj.f_update_time = current_timestamp()
            summary_obj.save()
        else:
            self.get_dynamic_db_model(ComponentSummary, self.job_id).create(
                f_job_id=self.job_id,
                f_component_name=self.component_name,
                f_role=self.role,
                f_party_id=self.party_id,
                f_task_id=self.task_id,
                f_task_version=self.task_version,
                f_summary=serialize_b64(summary_data, to_str=True),
                f_create_time=current_timestamp())
    except Exception as e:
        schedule_logger(self.job_id).exception(
            "An exception occurred while saving the summary of job id: {} "
            "component name: {} to the database:\n{}".format(
                self.job_id, self.component_name, e))

def drop_metric_data_mode(model):
    try:
        drop_sql = 'drop table t_tracking_metric_{}'.format(model)
        DB.execute_sql(drop_sql)
        stat_logger.info(drop_sql)
        return drop_sql
    except Exception as e:
        stat_logger.exception(e)
        raise e

def get_mysql_info():
    if IS_STANDALONE:
        return error_response(404, 'MySQL is only available in cluster mode')
    try:
        # Opening the connection context is the actual health check.
        with DB.connection_context():
            DB.random()
    except Exception as e:
        return error_response(503, str(e))
    return error_response(200)

def bulk_insert_model_data(self, model, data_source):
    with DB.connection_context():
        try:
            DB.create_tables([model])
            # Smaller batches for the local database, larger ones for a remote MySQL.
            batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
            for i in range(0, len(data_source), batch_size):
                with DB.atomic():
                    model.insert_many(data_source[i:i + batch_size]).execute()
            return len(data_source)
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
            return 0

def get_metric_list(self, job_level: bool = False):
    with DB.connection_context():
        metrics = dict()
        # Values are interpolated directly into the SQL string; they come from
        # internally generated job metadata, not from external user input.
        query_sql = 'select distinct f_metric_namespace, f_metric_name from t_tracking_metric_{} where ' \
                    'f_job_id = "{}" and f_component_name = "{}" and f_role = "{}" and f_party_id = "{}" ' \
                    'and f_task_id = "{}"'.format(
                        self.get_table_index(), self.job_id,
                        self.component_name if not job_level else 'dag',
                        self.role, self.party_id, self.task_id)
        cursor = DB.execute_sql(query_sql)
        for row in cursor.fetchall():
            # Group metric names under their namespace.
            metrics.setdefault(row[0], []).append(row[1])
        return metrics

def bulk_insert_into_db(model, data_source, logger):
    try:
        try:
            DB.create_tables([model])
        except Exception as e:
            # The table may already exist; log and keep going.
            logger.exception(e)
        batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
        for i in range(0, len(data_source), batch_size):
            with DB.atomic():
                model.insert_many(data_source[i:i + batch_size]).execute()
        return len(data_source)
    except Exception as e:
        logger.exception(e)
        return 0

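# A minimal usage sketch for the bulk insert helper above. The model class
# (HypotheticalMetric) and the row dicts are illustrative assumptions, not part
# of this codebase; any peewee model with matching fields would work the same way.
def _example_bulk_insert():
    rows = [
        {'f_key': 'k{}'.format(i), 'f_value': 'v{}'.format(i)}
        for i in range(2500)  # spans multiple batches of 1000 (or 50 locally)
    ]
    inserted = bulk_insert_into_db(HypotheticalMetric, rows, stat_logger)
    return inserted  # 0 signals the insert failed and was logged
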
def read_data_from_db(self, metric_namespace: str, metric_name: str, data_type, job_level=False):
    with DB.connection_context():
        try:
            query_sql = 'select f_key, f_value from t_tracking_metric_{} where ' \
                        'f_job_id = "{}" and f_component_name = "{}" and f_role = "{}" and f_party_id = "{}" ' \
                        'and f_task_id = "{}" and f_metric_namespace = "{}" and f_metric_name = "{}" ' \
                        'and f_type = "{}" order by f_id'.format(
                            self.get_table_index(), self.job_id,
                            self.component_name if not job_level else 'dag',
                            self.role, self.party_id, self.task_id,
                            metric_namespace, metric_name, data_type)
            cursor = DB.execute_sql(query_sql)
            for row in cursor.fetchall():
                # This is a generator: rows are deserialized and yielded one at a time.
                yield deserialize_b64(row[0]), deserialize_b64(row[1])
        except Exception as e:
            # On error, log and end the generator.
            schedule_logger(self.job_id).exception(e)

def do_load_model():
    request_data = request.json
    adapter_servings_config(request_data)
    retcode, retmsg = publish_model.load_model(config_data=request_data)
    try:
        if not retcode:
            with DB.connection_context():
                model = MLModel.get_or_none(
                    MLModel.f_role == request_data.get("local").get("role"),
                    MLModel.f_party_id == request_data.get("local").get("party_id"),
                    MLModel.f_model_id == request_data.get("job_parameters").get("model_id"),
                    MLModel.f_model_version == request_data.get("job_parameters").get("model_version"))
                if model:
                    # Record one more successful load of this model.
                    model.f_loaded_times += 1
                    model.save()
    except Exception as modify_err:
        stat_logger.exception(modify_err)

    try:
        # Copy the model files out of the local cache into a backup directory.
        party_model_id = gen_party_model_id(
            role=request_data.get("local").get("role"),
            party_id=request_data.get("local").get("party_id"),
            model_id=request_data.get("job_parameters").get("model_id"))
        src_model_path = os.path.join(file_utils.get_project_base_directory(), 'model_local_cache',
                                      party_model_id, request_data.get("job_parameters").get("model_version"))
        dst_model_path = os.path.join(file_utils.get_project_base_directory(), 'loaded_model_backup',
                                      party_model_id, request_data.get("job_parameters").get("model_version"))
        if not os.path.exists(dst_model_path):
            shutil.copytree(src=src_model_path, dst=dst_model_path)
    except Exception as copy_err:
        stat_logger.exception(copy_err)

    operation_record(request_data, "load", "success" if not retcode else "failed")
    return get_json_result(retcode=retcode, retmsg=retmsg)

def get(self, block=True, timeout=None):
    # Mirrors the blocking semantics of queue.Queue.get, backed by a MySQL table.
    with self.not_empty:
        if not block:
            if not self.query_events():
                raise Exception
        elif timeout is None:
            while not self.query_events():
                self.not_empty.wait()
        elif timeout < 0:
            raise ValueError("'timeout' must be a non-negative number")
        else:
            endtime = time() + timeout
            while not self.query_events():
                remaining = endtime - time()
                if remaining <= 0.0:
                    raise Exception
                self.not_empty.wait(remaining)
        with DB.connection_context():
            error = None
            MysqlQueue.lock(DB, 'fate_flow_job_queue', 10)
            try:
                item = Queue.select().where(Queue.f_is_waiting == 1)[0]
                if item:
                    self.update_event(item.f_job_id)
            except Exception as e:
                error = e
            MysqlQueue.unlock(DB, 'fate_flow_job_queue')
            if error:
                raise Exception(error)
        self.not_full.notify()
        return json.loads(item.f_event)

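# A minimal consumer sketch for get() above. How the queue instance is obtained
# and the payload shape are assumptions for illustration; only the get()/timeout
# behavior is taken from the code.
def _example_consume(queue):
    try:
        event = queue.get(block=True, timeout=5)  # dict decoded from f_event JSON
        return event
    except Exception:
        # get() raises a bare Exception when no event arrives within the timeout.
        return None
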
def delete_metric_data_from_db(metric_info):
    try:
        job_id = metric_info['job_id']
        metric_info.pop('job_id')
        # Metric tables are sharded by the first 8 characters of the job id.
        delete_sql = 'delete from t_tracking_metric_{} where f_job_id="{}"'.format(job_id[:8], job_id)
        for k, v in metric_info.items():
            # Only fields that exist on the TrackingMetric model become filters.
            if hasattr(TrackingMetric, "f_" + k):
                connect_str = " and f_"
                delete_sql = delete_sql + connect_str + k + '="{}"'.format(v)
        DB.execute_sql(delete_sql)
        stat_logger.info(delete_sql)
        return delete_sql
    except Exception as e:
        stat_logger.exception(e)
        raise e

def bind_model_service():
    request_config = request.json
    if request_config.get('job_id', None):
        with DB.connection_context():
            model = MLModel.get_or_none(
                MLModel.f_job_id == request_config.get("job_id"),
                MLModel.f_role == 'guest'
            )
        if model:
            model_info = model.to_json()
            request_config['initiator'] = {}
            request_config['initiator']['party_id'] = str(model_info.get('f_initiator_party_id'))
            request_config['initiator']['role'] = model_info.get('f_initiator_role')
            request_config['job_parameters'] = model_info.get('f_runtime_conf').get('job_parameters')
            request_config['role'] = model_info.get('f_runtime_conf').get('role')
            # Normalize all party ids in the role map to strings.
            for key, value in request_config['role'].items():
                for i, v in enumerate(value):
                    value[i] = str(v)
            request_config.pop('job_id')
        else:
            return get_json_result(retcode=101,
                                   retmsg="model {} cannot be found in the database. "
                                          "Please check if the model version is valid.".format(
                                              request_config.get('job_id')))
    if not request_config.get('servings'):
        # No servings specified: fill in all serving endpoints for this party.
        adapter_servings_config(request_config)
    service_id = request_config.get('service_id')
    if not service_id:
        return get_json_result(retcode=101, retmsg='no service id')
    check_config(request_config, ['initiator', 'role', 'job_parameters'])
    bind_status, retmsg = publish_model.bind_model_service(config_data=request_config)
    operation_record(request_config, "bind", "success" if not bind_status else "failed")
    return get_json_result(retcode=bind_status,
                           retmsg='service id is {}'.format(service_id) if not retmsg else retmsg)

def insert_data_to_db(self, metric_namespace: str, metric_name: str, data_type: int, kv, job_level=False):
    with DB.connection_context():
        try:
            # Build a template row shared by every key/value pair.
            tracking_metric = TrackingMetric.model(table_index=self.job_id)
            tracking_metric.f_job_id = self.job_id
            tracking_metric.f_component_name = self.component_name if not job_level else 'dag'
            tracking_metric.f_task_id = self.task_id
            tracking_metric.f_role = self.role
            tracking_metric.f_party_id = self.party_id
            tracking_metric.f_metric_namespace = metric_namespace
            tracking_metric.f_metric_name = metric_name
            tracking_metric.f_type = data_type
            default_db_source = tracking_metric.to_json()

            tracking_metric_data_source = []
            for k, v in kv:
                db_source = default_db_source.copy()
                db_source['f_key'] = serialize_b64(k)
                db_source['f_value'] = serialize_b64(v)
                db_source['f_create_time'] = current_timestamp()
                tracking_metric_data_source.append(db_source)
            self.bulk_insert_model_data(
                TrackingMetric.model(table_index=self.get_table_index()),
                tracking_metric_data_source)
        except Exception as e:
            schedule_logger(self.job_id).exception(e)

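# A minimal sketch of writing metric points with the method above. The metric
# values are invented for illustration; `tracker` stands for an instance of the
# enclosing tracking class.
def _example_insert_metrics(tracker):
    points = list(enumerate([0.9, 0.7, 0.55]))  # (key, value) pairs
    tracker.insert_data_to_db(
        metric_namespace='train',
        metric_name='loss',
        data_type=1,  # same type code that read_metric_data queries for
        kv=points)
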
def save_data_view(self, role, party_id, data_info, mark=False):
    with DB.connection_context():
        data_views = DataView.select().where(
            DataView.f_job_id == self.job_id,
            DataView.f_component_name == self.component_name,
            DataView.f_task_id == self.task_id,
            DataView.f_role == role,
            DataView.f_party_id == party_id)
        is_insert = True
        if mark and self.component_name == "upload_0":
            return
        if data_views:
            data_view = data_views[0]
            is_insert = False
        else:
            data_view = DataView()
            data_view.f_create_time = current_timestamp()
        data_view.f_job_id = self.job_id
        data_view.f_component_name = self.component_name
        data_view.f_task_id = self.task_id
        data_view.f_role = role
        data_view.f_party_id = party_id
        data_view.f_update_time = current_timestamp()
        for k, v in data_info.items():
            # Skip key fields and values still at their model defaults.
            if k in ['f_job_id', 'f_component_name', 'f_task_id', 'f_role', 'f_party_id'] \
                    or v == getattr(DataView, k).default:
                continue
            setattr(data_view, k, v)
        if is_insert:
            data_view.save(force_insert=True)
        else:
            data_view.save()
        return data_view

def do_load_model():
    request_data = request.json
    request_data['servings'] = RuntimeConfig.SERVICE_DB.get_urls('servings')

    role = request_data['local']['role']
    party_id = request_data['local']['party_id']
    model_id = request_data['job_parameters']['model_id']
    model_version = request_data['job_parameters']['model_version']
    party_model_id = model_utils.gen_party_model_id(model_id, role, party_id)

    if get_base_config('enable_model_store', False):
        # Keep the local pipeline cache and the remote model store in sync.
        pipeline_model = pipelined_model.PipelinedModel(party_model_id, model_version)
        component_parameters = {
            'model_id': party_model_id,
            'model_version': model_version,
            'store_address': ServiceRegistry.MODEL_STORE_ADDRESS,
        }
        model_storage = get_model_storage(component_parameters)
        if pipeline_model.exists() and not model_storage.exists(**component_parameters):
            stat_logger.info(f'Uploading {pipeline_model.model_path} to model storage.')
            model_storage.store(**component_parameters)
        elif not pipeline_model.exists() and model_storage.exists(**component_parameters):
            stat_logger.info(f'Downloading {pipeline_model.model_path} from model storage.')
            model_storage.restore(**component_parameters)

    if not model_utils.check_if_deployed(role, party_id, model_id, model_version):
        return get_json_result(
            retcode=100,
            retmsg="Only deployed models can be loaded. "
                   "Please deploy the model before loading.")

    retcode, retmsg = publish_model.load_model(request_data)
    try:
        if not retcode:
            with DB.connection_context():
                model = MLModel.get_or_none(
                    MLModel.f_role == request_data["local"]["role"],
                    MLModel.f_party_id == request_data["local"]["party_id"],
                    MLModel.f_model_id == request_data["job_parameters"]["model_id"],
                    MLModel.f_model_version == request_data["job_parameters"]["model_version"])
                if model:
                    model.f_loaded_times += 1
                    model.save()
    except Exception as modify_err:
        stat_logger.exception(modify_err)

    operation_record(request_data, "load", "success" if not retcode else "failed")
    return get_json_result(retcode=retcode, retmsg=retmsg)

def resource_for_job(cls, job_id, role, party_id, operation_type):
    operate_status = False
    engine_name, cores, memory = cls.calculate_job_resource(job_id=job_id, role=role, party_id=party_id)
    try:
        with DB.atomic():
            updates = {
                Job.f_engine_type: EngineType.COMPUTING,
                Job.f_engine_name: engine_name,
                Job.f_cores: cores,
                Job.f_memory: memory,
            }
            filters = [
                Job.f_job_id == job_id,
                Job.f_role == role,
                Job.f_party_id == party_id,
            ]
            if operation_type == ResourceOperation.APPLY:
                updates[Job.f_remaining_cores] = cores
                updates[Job.f_remaining_memory] = memory
                updates[Job.f_resource_in_use] = True
                updates[Job.f_apply_resource_time] = base_utils.current_timestamp()
                # Only apply if the job is not already holding resources.
                filters.append(Job.f_resource_in_use == False)
            elif operation_type == ResourceOperation.RETURN:
                updates[Job.f_resource_in_use] = False
                updates[Job.f_return_resource_time] = base_utils.current_timestamp()
                filters.append(Job.f_resource_in_use == True)
            operate = Job.update(updates).where(*filters)
            record_status = operate.execute() > 0
            if not record_status:
                raise RuntimeError(f"record job {job_id} resource {operation_type} failed on {role} {party_id}")

            filters, updates = cls.update_resource_sql(resource_model=EngineRegistry,
                                                       cores=cores,
                                                       memory=memory,
                                                       operation_type=operation_type)
            filters.append(EngineRegistry.f_engine_type == EngineType.COMPUTING)
            filters.append(EngineRegistry.f_engine_name == engine_name)
            operate = EngineRegistry.update(updates).where(*filters)
            apply_status = operate.execute() > 0
            if not apply_status:
                raise RuntimeError(
                    f"{operation_type} resource from engine {engine_name} for job {job_id} "
                    f"resource {operation_type} failed on {role} {party_id}")
            operate_status = True
    except Exception as e:
        schedule_logger(job_id=job_id).warning(e)
        schedule_logger(job_id=job_id).warning(
            f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} failed")
        operate_status = False
    finally:
        remaining_cores, remaining_memory = cls.get_remaining_resource(
            EngineRegistry,
            [EngineRegistry.f_engine_type == EngineType.COMPUTING,
             EngineRegistry.f_engine_name == engine_name])
        operate_msg = "successfully" if operate_status else "failed"
        schedule_logger(job_id=job_id).info(
            f"{operation_type} job {job_id} resource(cores {cores} memory {memory}) on {role} {party_id} "
            f"{operate_msg}, remaining cores: {remaining_cores} remaining memory: {remaining_memory}")
        return operate_status

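# The guarded updates above implement a compare-and-set: the WHERE clause on
# f_resource_in_use makes apply/return idempotent, since a second APPLY matches
# zero rows and execute() reports no change. A standalone sketch of the same
# pattern with a hypothetical peewee model (ResourceRow is not part of this codebase):
def _example_compare_and_set(job_id):
    rows_changed = (ResourceRow
                    .update({ResourceRow.f_resource_in_use: True})
                    .where(ResourceRow.f_job_id == job_id,
                           ResourceRow.f_resource_in_use == False)
                    .execute())
    return rows_changed > 0  # False means the resources were already applied
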
def get_job_view(self):
    with DB.connection_context():
        view_data = {}
        for k, v in self.read_data_from_db('job', 'job_view', 2, job_level=True):
            view_data[k] = v
        return view_data

def save_job_info(self, role, party_id, job_info, create=False):
    with DB.connection_context():
        schedule_logger(self.job_id).info('save {} {} job: {}'.format(role, party_id, job_info))
        jobs = Job.select().where(Job.f_job_id == self.job_id,
                                  Job.f_role == role,
                                  Job.f_party_id == party_id)
        is_insert = True
        if jobs:
            job = jobs[0]
            is_insert = False
            if job.f_status == JobStatus.TIMEOUT:
                return None
        elif create:
            job = Job()
            job.f_create_time = current_timestamp()
        else:
            return None
        job.f_job_id = self.job_id
        job.f_role = role
        job.f_party_id = party_id
        if 'f_status' in job_info:
            if job.f_status in [JobStatus.COMPLETE, JobStatus.FAILED]:
                # A terminal status cannot be updated
                # TODO:
                pass
            if (job_info['f_status'] in [JobStatus.FAILED, JobStatus.TIMEOUT]) and (not job.f_end_time):
                if not job.f_start_time:
                    return
                job_info['f_end_time'] = current_timestamp()
                job_info['f_elapsed'] = job_info['f_end_time'] - job.f_start_time
                job_info['f_update_time'] = current_timestamp()
            if (job_info['f_status'] in [JobStatus.FAILED, JobStatus.TIMEOUT,
                                         JobStatus.CANCELED, JobStatus.COMPLETE]):
                job_info['f_tag'] = 'job_end'
        update_fields = []
        for k, v in job_info.items():
            try:
                # Skip key fields and values still at their model defaults.
                if k in ['f_job_id', 'f_role', 'f_party_id'] or v == getattr(Job, k).default:
                    continue
                setattr(job, k, v)
                update_fields.append(getattr(Job, k))
            except Exception:
                pass
        if is_insert:
            job.save(force_insert=True)
        else:
            job.save(only=update_fields)

def read_metric_data(self, metric_namespace: str, metric_name: str, job_level=False):
    with DB.connection_context():
        metrics = []
        for k, v in self.read_data_from_db(metric_namespace, metric_name, 1, job_level):
            metrics.append(Metric(key=k, value=v))
        return metrics

def get_job_configuration(job_id, role, party_id):
    with DB.connection_context():
        jobs = Job.select(Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf).where(
            Job.f_job_id == job_id,
            Job.f_role == role,
            Job.f_party_id == party_id)
        if jobs:
            job = jobs[0]
            return json_loads(job.f_dsl), json_loads(job.f_runtime_conf), json_loads(job.f_train_runtime_conf)
        else:
            return {}, {}, {}

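# A usage sketch for get_job_configuration above; the job id, role, and party id
# are illustrative values. Empty dicts signal that no matching job was found.
def _example_read_job_conf():
    dsl, runtime_conf, train_runtime_conf = get_job_configuration(
        job_id='202001010000000000', role='guest', party_id=9999)
    if not dsl:
        stat_logger.warning('job not found')
    return dsl, runtime_conf, train_runtime_conf
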
def get_metric_meta(self, metric_namespace: str, metric_name: str, job_level: bool = False):
    with DB.connection_context():
        kv = dict()
        for k, v in self.read_data_from_db(metric_namespace, metric_name, 0, job_level):
            kv[k] = v
        return MetricMeta(name=kv.get('name'), metric_type=kv.get('metric_type'), extra_metas=kv)

def query_task(**kwargs):
    with DB.connection_context():
        filters = []
        # Any keyword that matches an f_-prefixed column becomes an equality filter.
        for f_n, f_v in kwargs.items():
            attr_name = 'f_%s' % f_n
            if hasattr(Task, attr_name):
                filters.append(operator.attrgetter(attr_name)(Task) == f_v)
        if filters:
            tasks = Task.select().where(*filters)
        else:
            tasks = Task.select()
        return [task for task in tasks]

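# query_job and query_data_view below reuse the same keyword-to-filter pattern.
# A usage sketch with illustrative argument values:
def _example_query_tasks():
    for task in query_task(job_id='202001010000000000', role='guest', party_id=9999):
        print(task.f_task_id, task.f_status)
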
def put(self, item, block=True, timeout=None):
    with self.not_full:
        with DB.connection_context():
            error = None
            MysqlQueue.lock(DB, 'fate_flow_job_queue', 10)
            try:
                self.update_event(item=item)
            except Exception as e:
                error = e
            MysqlQueue.unlock(DB, 'fate_flow_job_queue')
            if error:
                raise Exception(error)
        self.not_empty.notify()

def set_status(self, status=None, job_id=None):
    is_failed = False
    with DB.connection_context():
        error = None
        MysqlQueue.lock(DB, 'fate_flow_job_queue', 10)
        try:
            is_failed = self.update_event(status=status, job_id=job_id)
        except Exception as e:
            error = e
        MysqlQueue.unlock(DB, 'fate_flow_job_queue')
        if error:
            raise Exception(error)
        return is_failed

def query_data_view(**kwargs):
    with DB.connection_context():
        filters = []
        for f_n, f_v in kwargs.items():
            attr_name = 'f_%s' % f_n
            if hasattr(DataView, attr_name):
                filters.append(operator.attrgetter(attr_name)(DataView) == f_v)
        if filters:
            data_views = DataView.select().where(*filters)
        else:
            data_views = []
        return [data_view for data_view in data_views]

def check_request_parameters(request_data):
    with DB.connection_context():
        if 'role' not in request_data and 'party_id' not in request_data:
            # Fall back to the initiator recorded in the job's runtime conf.
            jobs = Job.select(Job.f_runtime_conf).where(
                Job.f_job_id == request_data.get('job_id', ''),
                Job.f_is_initiator == 1)
            if jobs:
                job = jobs[0]
                job_runtime_conf = json_loads(job.f_runtime_conf)
                job_initiator = job_runtime_conf.get('initiator', {})
                request_data['role'] = job_initiator.get('role', '')
                request_data['party_id'] = job_initiator.get('party_id', 0)

def query_job(**kwargs):
    with DB.connection_context():
        filters = []
        for f_n, f_v in kwargs.items():
            attr_name = 'f_%s' % f_n
            if hasattr(Job, attr_name):
                filters.append(operator.attrgetter(attr_name)(Job) == f_v)
        if filters:
            jobs = Job.select().where(*filters)
            return [job for job in jobs]
        else:
            # Querying all jobs at once is not allowed.
            return []

def get_job_dsl_parser_by_job_id(job_id):
    with DB.connection_context():
        jobs = Job.select(Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf).where(
            Job.f_job_id == job_id)
        if jobs:
            job = jobs[0]
            job_dsl_parser = get_job_dsl_parser(
                dsl=json_loads(job.f_dsl),
                runtime_conf=json_loads(job.f_runtime_conf),
                train_runtime_conf=json_loads(job.f_train_runtime_conf))
            return job_dsl_parser
        else:
            return None

def save_to_db(cls):
    # save component registry info
    with DB.lock("component_register"):
        for provider_name, provider_group_info in cls.REGISTRY["providers"].items():
            for version, version_register_info in provider_group_info.items():
                if version != "default":
                    version_info = {
                        "f_path": version_register_info.get("path"),
                        "f_python": version_register_info.get("python", ""),
                        "f_class_path": version_register_info.get("class_path"),
                        "f_version": version,
                        "f_provider_name": provider_name
                    }
                    cls.safe_save(ComponentProviderInfo, version_info,
                                  f_version=version, f_provider_name=provider_name)
                    for component_name, component_info in version_register_info.get("components").items():
                        component_registry_info = {
                            "f_version": version,
                            "f_provider_name": provider_name,
                            "f_component_name": component_name,
                            "f_module": component_info.get("module")
                        }
                        cls.safe_save(ComponentRegistryInfo, component_registry_info,
                                      f_version=version,
                                      f_provider_name=provider_name,
                                      f_component_name=component_name)
        for component_name, info in cls.REGISTRY["components"].items():
            component_info = {
                "f_component_name": component_name,
                "f_default_provider": info.get("default_provider"),
                "f_support_provider": info.get("support_provider"),
                "f_component_alias": info.get("alias"),
            }
            cls.safe_save(ComponentInfo, component_info, f_component_name=component_name)

def dell(self, item):
    with self.not_empty:
        with DB.connection_context():
            MysqlQueue.lock(DB, 'fate_flow_job_queue', 10)
            del_status = True
            try:
                job_id = item.get('job_id')
                event = Queue.select().where(Queue.f_job_id == job_id)[0]
                if event.f_is_waiting != 1:
                    del_status = False
                # Mark the event as no longer waiting.
                event.f_is_waiting = 2
                event.save()
            except Exception as e:
                stat_logger.exception(e)
                del_status = False
            MysqlQueue.unlock(DB, 'fate_flow_job_queue')
        self.not_full.notify()
        return del_status

def put(self, item, block=True, timeout=None, status=None, job_id=''):
    is_failed = False
    with self.not_full:
        with DB.connection_context():
            error = None
            MysqlQueue.lock(DB, 'fate_flow_job_queue', 10)
            try:
                is_failed = self.update_event(item=item, status=status, job_id=job_id, operating='put')
            except Exception as e:
                error = e
            MysqlQueue.unlock(DB, 'fate_flow_job_queue')
            if error:
                raise Exception(error)
        self.not_empty.notify()
    return is_failed
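
# A minimal producer/consumer round trip for the MySQL-backed queue. Constructing
# MysqlQueue with no arguments and the event payload shape are assumptions for
# illustration; only the put()/get() semantics are taken from the code above.
def _example_queue_round_trip():
    queue = MysqlQueue()  # hypothetical no-arg construction
    queue.put({'job_id': '202001010000000000', 'event': 'job_submit'})
    return queue.get(block=True, timeout=10)  # blocks until an event is waiting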