def get_table_meta(self, table_info):
    schedule_logger(self.job_id).info(f'start get table meta: {table_info}')
    table_meta_dict = storage.StorageTableMeta(namespace=table_info.get("namespace"),
                                               name=table_info.get("table_name"),
                                               create_address=False).to_dict()
    schedule_logger(self.job_id).info(f'get table meta success: {table_meta_dict}')
    table_meta_dict["part_of_data"] = serialize_b64(table_meta_dict["part_of_data"], to_str=True)
    table_meta_dict["schema"] = serialize_b64(table_meta_dict["schema"], to_str=True)
    return table_meta_dict
def table_api(table_func):
    config = request.json
    if table_func == 'table_info':
        table_key_count = 0
        table_partition = None
        table_schema = None
        table_name, namespace = config.get("name") or config.get("table_name"), config.get("namespace")
        table_meta = storage.StorageTableMeta(name=table_name, namespace=namespace)
        if table_meta:
            table_key_count = table_meta.get_count()
            table_partition = table_meta.get_partitions()
            table_schema = table_meta.get_schema()
            exist = 1
        else:
            exist = 0
        return get_json_result(data={
            "table_name": table_name,
            "namespace": namespace,
            "exist": exist,
            "count": table_key_count,
            "partition": table_partition,
            "schema": table_schema
        })
    else:
        return get_json_result()
def write_to_db(conf, table_name, file_name, namespace, partitions, head):
    db = MysqldbHelper(**conf)
    table_meta = storage.StorageTableMeta(name=table_name, namespace=namespace)
    create_table = 'create table {}(id varchar(50) NOT NULL, features LONGTEXT, PRIMARY KEY(id))'.format(table_name)
    db.execute(create_table)  # table name is already substituted above
    print('create table {}'.format(table_name))
    with open(file_name, 'r') as f:
        if head:
            data_head = f.readline()
            header_source_item = data_head.split(',')
            table_meta.update_metas(schema={'header': ','.join(header_source_item[1:]).strip(),
                                            'sid': header_source_item[0]})
        n = 0
        count = 0
        while True:
            data = list()
            lines = f.readlines(12400)
            if lines:
                sql = 'REPLACE INTO {}(id, features) VALUES'.format(table_name)
                for line in lines:
                    count += 1
                    values = line.replace("\n", "").replace("\t", ",").split(",")
                    data.append((values[0], list_to_str(values[1:])))
                    sql += '("{}", "{}"),'.format(values[0], list_to_str(values[1:]))
                sql = sql.rstrip(',') + ';'  # strip the trailing comma
                if n == 0:
                    table_meta.update_metas(part_of_data=data, partitions=partitions)
                n += 1
                db.execute(sql)
                db.con.commit()
            else:
                break
    table_meta.update_metas(count=count)
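# --- Illustrative sketch (not part of the module above) ---
# write_to_db builds each REPLACE INTO statement via string formatting, which
# breaks if an id or feature string contains quotes. This is a minimal,
# self-contained sketch of the same chunked-batch pattern using DB-API
# parameter binding; sqlite3 stands in for MySQL purely so the example runs
# anywhere, and all names here are hypothetical.
import sqlite3

def batched_replace_demo(rows, batch_size=500):
    con = sqlite3.connect(":memory:")
    con.execute('CREATE TABLE demo (id VARCHAR(50) NOT NULL PRIMARY KEY, features TEXT)')
    batch = []
    for row in rows:  # rows is an iterable of (id, features) tuples
        batch.append(row)
        if len(batch) >= batch_size:
            # placeholders are bound by the driver, so no manual quoting is needed
            con.executemany('REPLACE INTO demo (id, features) VALUES (?, ?)', batch)
            batch = []
    if batch:
        con.executemany('REPLACE INTO demo (id, features) VALUES (?, ?)', batch)
    con.commit()
    return con.execute('SELECT COUNT(*) FROM demo').fetchone()[0]

# batched_replace_demo([("id0", "1,2,3"), ("id1", "4,5,6")]) -> 2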
def get_output_data_table(self, output_data_infos, tracker_client=None):
    """
    Get component output data table, will run in the task executor process
    :param output_data_infos:
    :return:
    """
    output_tables_meta = {}
    if output_data_infos:
        for output_data_info in output_data_infos:
            schedule_logger(self.job_id).info("Get task {} {} output table {} {}".format(
                output_data_info.f_task_id, output_data_info.f_task_version,
                output_data_info.f_table_namespace, output_data_info.f_table_name))
            if not tracker_client:
                data_table_meta = storage.StorageTableMeta(name=output_data_info.f_table_name,
                                                           namespace=output_data_info.f_table_namespace)
            else:
                data_table_meta = tracker_client.get_table_meta(output_data_info.f_table_name,
                                                                output_data_info.f_table_namespace)
            output_tables_meta[output_data_info.f_data_name] = data_table_meta
    return output_tables_meta
def get_computing_table(self, name, namespace, schema=None):
    storage_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    computing_table = session.get_computing_session().load(
        storage_table_meta.get_address(),
        schema=schema if schema else storage_table_meta.get_schema(),
        partitions=self.parameters.get("partitions"))
    return computing_table
def load(cls, cache: DataCache) -> typing.Tuple[typing.Dict[str, CTableABC], dict]:
    cache_data = {}
    for name, table in cache.data.items():
        storage_table_meta = storage.StorageTableMeta(name=table.name, namespace=table.namespace)
        computing_table = session.get_computing_session().load(
            storage_table_meta.get_address(),
            schema=storage_table_meta.get_schema(),
            partitions=table.partitions)
        cache_data[name] = computing_table
    return cache_data, cache.meta
def download_upload(access_module):
    job_id = job_utils.generate_job_id()
    if access_module == "upload" and UPLOAD_DATA_FROM_CLIENT and not (
            request.json and request.json.get("use_local_data") == 0):
        file = request.files['file']
        filename = os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp', file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception as e:
            shutil.rmtree(os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp'))
            raise e
        job_config = request.args.to_dict()
        if "namespace" in job_config and "table_name" in job_config:
            pass
        else:
            # versions later than 1.5.1 support eggroll run parameters
            job_config = json_loads(list(job_config.keys())[0])
        job_config['file'] = filename
    else:
        job_config = request.json
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif access_module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('unsupported access module: {}'.format(access_module))
    detect_utils.check_config(job_config, required_arguments=required_arguments)
    data = {}
    # compatibility
    if "table_name" in job_config:
        job_config["name"] = job_config["table_name"]
    if "backend" not in job_config:
        job_config["backend"] = 0
    for _ in ["work_mode", "backend", "head", "partition", "drop"]:
        if _ in job_config:
            job_config[_] = int(job_config[_])
    if access_module == "upload":
        job_config["destroy"] = (job_config.get('drop', 0) == 1)
        data['table_name'] = job_config["table_name"]
        data['namespace'] = job_config["namespace"]
        data_table_meta = storage.StorageTableMeta(name=job_config["table_name"],
                                                   namespace=job_config["namespace"])
        if data_table_meta and not job_config["destroy"]:
            return get_json_result(retcode=100,
                                   retmsg='The data table already exists. '
                                          'If you still want to continue uploading, please add the parameter -drop. '
                                          '0 means not to delete and continue uploading, '
                                          '1 means to upload again after deleting the table')
    job_dsl, job_runtime_conf = gen_data_access_job_config(job_config, access_module)
    submit_result = DAGScheduler.submit({'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf}, job_id=job_id)
    data.update(submit_result)
    return get_json_result(job_id=job_id, data=data)
def save_table_meta(self, meta):
    schedule_logger(self.job_id).info(f'start save table meta: {meta}')
    address = storage.StorageTableMeta.create_address(storage_engine=meta.get("engine"),
                                                      address_dict=meta.get("address"))
    # resolve the address object and deserialize the b64 fields before setting
    # the metas, so the created meta record holds the decoded values
    meta["address"] = address
    meta["part_of_data"] = deserialize_b64(meta["part_of_data"])
    meta["schema"] = deserialize_b64(meta["schema"])
    table_meta = storage.StorageTableMeta(name=meta.get("name"), namespace=meta.get("namespace"), new=True)
    table_meta.set_metas(**meta)
    table_meta.create()
    schedule_logger(self.job_id).info('save table meta success')
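# --- Illustrative sketch (not part of the module above) ---
# get_table_meta b64-serializes "part_of_data" and "schema" so they survive the
# JSON hop between services, and save_table_meta decodes them on arrival. The
# stand-ins below assume the helpers are the usual pickle+base64 pair (as in
# fate_arch.common.base_utils); they are defined locally so the sketch runs alone.
import base64
import pickle

def serialize_b64(src, to_str=False):
    dest = base64.b64encode(pickle.dumps(src))
    return dest.decode() if to_str else dest

def deserialize_b64(src):
    return pickle.loads(base64.b64decode(src))

schema = {'sid': 'id', 'header': 'x0,x1,x2'}
wire_value = serialize_b64(schema, to_str=True)   # JSON-safe string
assert deserialize_b64(wire_value) == schema      # round-trips losslessly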
def get_input_data_min_partitions(input_data, role, party_id):
    min_partition = None
    if role != 'arbiter':
        for data_type, data_location in input_data[role][party_id].items():
            table_info = {'name': data_location.split('.')[1],
                          'namespace': data_location.split('.')[0]}
            table_meta = storage.StorageTableMeta(name=table_info['name'],
                                                  namespace=table_info['namespace'])
            if table_meta:
                table_partition = table_meta.get_partitions()
                if not min_partition or min_partition > table_partition:
                    min_partition = table_partition
    return min_partition
def table_add():
    request_data = request.json
    detect_utils.check_config(request_data, required_arguments=["engine", "address", "namespace", "name",
                                                                ("head", (0, 1)), "id_delimiter"])
    address_dict = request_data.get('address')
    engine = request_data.get('engine')
    name = request_data.get('name')
    namespace = request_data.get('namespace')
    address = storage.StorageTableMeta.create_address(storage_engine=engine, address_dict=address_dict)
    in_serialized = request_data.get("in_serialized",
                                     1 if engine in {storage.StorageEngine.STANDALONE,
                                                     storage.StorageEngine.EGGROLL,
                                                     storage.StorageEngine.MYSQL} else 0)
    destroy = (int(request_data.get("drop", 0)) == 1)
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if data_table_meta:
        if destroy:
            data_table_meta.destroy_metas()
        else:
            return get_json_result(retcode=100,
                                   retmsg='The data table already exists. '
                                          'If you still want to continue uploading, please add the parameter -drop. '
                                          '1 means to add again after deleting the table')
    id_name = request_data.get("id_name")
    feature_name = request_data.get("feature_name")
    schema = None
    if id_name and feature_name:
        schema = {'header': feature_name, 'sid': id_name}
    with storage.Session.build(storage_engine=engine, options=request_data.get("options")) as storage_session:
        storage_session.create_table(address=address, name=name, namespace=namespace,
                                     partitions=request_data.get('partitions', None),
                                     hava_head=request_data.get("head"),  # sic: keyword name as defined by create_table
                                     id_delimiter=request_data.get("id_delimiter"),
                                     in_serialized=in_serialized,
                                     schema=schema)
    return get_json_result(data={"table_name": name, "namespace": namespace})
def table_disable():
    request_data = request.json
    adapter_request_data(request_data)
    disable = request.url.endswith("disable")
    tables_meta = storage.StorageTableMeta.query_table_meta(filter_fields=dict(**request_data))
    data = []
    if tables_meta:
        for table_meta in tables_meta:
            storage.StorageTableMeta(name=table_meta.f_name,
                                     namespace=table_meta.f_namespace).update_metas(disable=disable)
            data.append({'table_name': table_meta.f_name, 'namespace': table_meta.f_namespace})
        return get_json_result(data=data)
    return get_json_result(retcode=101, retmsg='no table found')
def table_download():
    request_data = request.json
    from fate_flow.component_env_utils.env_utils import import_component_output_depend
    import_component_output_depend()
    data_table_meta = storage.StorageTableMeta(name=request_data.get("name"),
                                               namespace=request_data.get("namespace"))
    if not data_table_meta:
        return error_response(response_code=210,
                              retmsg=f'no table found: {request_data.get("namespace")}, {request_data.get("name")}')
    tar_file_name = 'table_{}_{}.tar.gz'.format(request_data.get("namespace"), request_data.get("name"))
    return TableStorage.send_table(output_tables_meta={"table": data_table_meta},
                                   tar_file_name=tar_file_name,
                                   need_head=request_data.get("head", True))
def get_upload_info(jobs_run_conf):
    data = []
    for job_id, job_run_conf in jobs_run_conf.items():
        info = {}
        table_name = job_run_conf["name"]
        namespace = job_run_conf["namespace"]
        table_meta = storage.StorageTableMeta(name=table_name, namespace=namespace)
        if table_meta:
            partition = job_run_conf["partition"]
            info["upload_info"] = {
                "table_name": table_name,
                "namespace": namespace,
                "partition": partition,
                "upload_count": table_meta.get_count()
            }
            info["notes"] = job_run_conf["notes"]
            info["schema"] = table_meta.get_schema()
            data.append({job_id: info})
    return data
def get_table_meta(self, table_name, table_namespace):
    request_body = {"table_name": table_name, "namespace": table_namespace}
    response = api_utils.local_api(job_id=self.job_id,
                                   method='POST',
                                   endpoint='/tracker/{}/{}/{}/{}/{}/{}/table_meta/get'.format(
                                       self.job_id, self.component_name, self.task_id,
                                       self.task_version, self.role, self.party_id),
                                   json_body=request_body)
    if response['retcode'] != RetCode.SUCCESS:
        raise Exception(f"get table meta failed: {response['retmsg']}")
    data_table_meta = storage.StorageTableMeta(name=table_name, namespace=table_namespace, new=True)
    data_table_meta.set_metas(**response["data"])
    data_table_meta.address = storage.StorageTableMeta.create_address(
        storage_engine=response["data"].get("engine"),
        address_dict=response["data"].get("address"))
    data_table_meta.part_of_data = deserialize_b64(data_table_meta.part_of_data)
    data_table_meta.schema = deserialize_b64(data_table_meta.schema)
    return data_table_meta
def table_api(table_func):
    config = request.json
    if table_func == 'table_info':
        table_key_count = 0
        table_partition = None
        table_schema = None
        table_name, namespace = config.get("name") or config.get("table_name"), config.get("namespace")
        table_meta = storage.StorageTableMeta(name=table_name, namespace=namespace)
        address = None
        enable = True
        origin = None
        if table_meta:
            table_key_count = table_meta.get_count()
            table_partition = table_meta.get_partitions()
            table_schema = table_meta.get_schema()
            address = table_meta.get_address().__dict__
            enable = not table_meta.get_disable()
            origin = table_meta.get_origin()
            exist = 1
        else:
            exist = 0
        return get_json_result(data={
            "table_name": table_name,
            "namespace": namespace,
            "exist": exist,
            "count": table_key_count,
            "partition": table_partition,
            "schema": table_schema,
            "enable": enable,
            "origin": origin,
            "address": address,
        })
    else:
        return get_json_result()
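# --- Illustrative sketch (not part of the module above) ---
# A client-side view of the table_info response built by table_api. The route
# URL is hypothetical (the route decorator is not shown here); the request and
# response fields mirror the handler above.
import requests

def query_table_info(base_url, name, namespace):
    resp = requests.post(f"{base_url}/table/table_info",  # hypothetical route
                         json={"name": name, "namespace": namespace}).json()
    info = resp.get("data", {})
    if not info.get("exist"):
        return None
    # these keys are filled from the table meta by the handler above
    return {k: info.get(k) for k in ("count", "partition", "schema", "enable", "origin", "address")}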
def component_check(cls, job, check_type="inheritance"):
    if check_type == "rerun":
        task_list = JobSaver.query_task(job_id=job.f_job_id, party_id=job.f_party_id, role=job.f_role,
                                        status=TaskStatus.SUCCESS, only_latest=True)
        tasks = {}
        for task in task_list:
            tasks[task.f_component_name] = task
    else:
        tasks = JobController.load_tasks(component_list=job.f_inheritance_info.get("component_list", []),
                                         job_id=job.f_inheritance_info.get("job_id"),
                                         role=job.f_role,
                                         party_id=job.f_party_id)
    tracker_dict = JobController.load_task_tracker(tasks)
    missing_dependence_component_list = []
    # data dependence
    for tracker in tracker_dict.values():
        table_infos = tracker.get_output_data_info()
        for table in table_infos:
            table_meta = storage.StorageTableMeta(name=table.f_table_name,
                                                  namespace=table.f_table_namespace)
            if not table_meta:
                missing_dependence_component_list.append(tracker.component_name)
                continue
    if check_type == "rerun":
        return missing_dependence_component_list
    elif check_type == "inheritance":
        # reload component list
        return list(set(job.f_inheritance_info.get("component_list", [])) -
                    set(missing_dependence_component_list))
def get_task_run_args(cls, job_id, role, party_id, task_id, task_version, job_args,
                      job_parameters: RunParameters, task_parameters: RunParameters,
                      input_dsl, filter_type=None, filter_attr=None, get_input_table=False):
    task_run_args = {}
    input_table = {}
    if 'idmapping' in role:
        return {}
    for input_type, input_detail in input_dsl.items():
        if filter_type and input_type not in filter_type:
            continue
        if input_type == 'data':
            this_type_args = task_run_args[input_type] = task_run_args.get(input_type, {})
            for data_type, data_list in input_detail.items():
                data_dict = {}
                for data_key in data_list:
                    data_key_item = data_key.split('.')
                    data_dict[data_key_item[0]] = {data_type: []}
                for data_key in data_list:
                    data_key_item = data_key.split('.')
                    search_component_name, search_data_name = data_key_item[0], data_key_item[1]
                    storage_table_meta = None
                    if search_component_name == 'args':
                        if job_args.get('data', {}).get(search_data_name).get('namespace', '') and \
                                job_args.get('data', {}).get(search_data_name).get('name', ''):
                            storage_table_meta = storage.StorageTableMeta(
                                name=job_args['data'][search_data_name]['name'],
                                namespace=job_args['data'][search_data_name]['namespace'])
                    else:
                        tracker_client = TrackerClient(job_id=job_id, role=role, party_id=party_id,
                                                       component_name=search_component_name)
                        upstream_output_table_infos_json = tracker_client.get_output_data_info(
                            data_name=search_data_name)
                        if upstream_output_table_infos_json:
                            tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                                              component_name=search_component_name)
                            upstream_output_table_infos = []
                            for _ in upstream_output_table_infos_json:
                                upstream_output_table_infos.append(
                                    fill_db_model_object(
                                        Tracker.get_dynamic_db_model(TrackingOutputDataInfo, job_id)(), _))
                            output_tables_meta = tracker.get_output_data_table(
                                output_data_infos=upstream_output_table_infos)
                            if output_tables_meta:
                                storage_table_meta = output_tables_meta.get(search_data_name, None)
                    args_from_component = this_type_args[search_component_name] = this_type_args.get(
                        search_component_name, {})
                    if get_input_table and storage_table_meta:
                        input_table[data_key] = {'namespace': storage_table_meta.get_namespace(),
                                                 'name': storage_table_meta.get_name()}
                        computing_table = None
                    elif storage_table_meta:
                        LOGGER.info(f"load computing table with {task_parameters.computing_partitions} partitions")
                        computing_table = session.get_latest_opened().computing.load(
                            storage_table_meta.get_address(),
                            schema=storage_table_meta.get_schema(),
                            partitions=task_parameters.computing_partitions)
                    else:
                        computing_table = None
                    if not computing_table or not filter_attr or not filter_attr.get("data", None):
                        data_dict[search_component_name][data_type].append(computing_table)
                        args_from_component[data_type] = data_dict[search_component_name][data_type]
                    else:
                        args_from_component[data_type] = dict(
                            [(a, getattr(computing_table, "get_{}".format(a))()) for a in filter_attr["data"]])
        elif input_type in ['model', 'isometric_model']:
            this_type_args = task_run_args[input_type] = task_run_args.get(input_type, {})
            for dsl_model_key in input_detail:
                dsl_model_key_items = dsl_model_key.split('.')
                if len(dsl_model_key_items) == 2:
                    search_component_name, search_model_alias = dsl_model_key_items[0], dsl_model_key_items[1]
                elif len(dsl_model_key_items) == 3 and dsl_model_key_items[0] == 'pipeline':
                    search_component_name, search_model_alias = dsl_model_key_items[1], dsl_model_key_items[2]
                else:
                    raise Exception('get input {} failed'.format(input_type))
                models = Tracker(job_id=job_id, role=role, party_id=party_id,
                                 component_name=search_component_name,
                                 model_id=job_parameters.model_id,
                                 model_version=job_parameters.model_version).get_output_model(
                                     model_alias=search_model_alias)
                this_type_args[search_component_name] = models
    if get_input_table:
        return input_table
    return task_run_args
def save_output_data(self, computing_table, output_storage_engine, output_storage_address: dict,
                     output_table_namespace=None, output_table_name=None):
    if computing_table:
        if not output_table_namespace or not output_table_name:
            output_table_namespace, output_table_name = data_utils.default_output_table_info(
                task_id=self.task_id, task_version=self.task_version)
        schedule_logger(self.job_id).info('persisting the component output temporary table to {} {}'.format(
            output_table_namespace, output_table_name))
        partitions = computing_table.partitions
        schedule_logger(self.job_id).info('output data table partitions is {}'.format(partitions))
        address_dict = output_storage_address.copy()
        if output_storage_engine == StorageEngine.EGGROLL:
            address_dict.update({"name": output_table_name,
                                 "namespace": output_table_namespace,
                                 "storage_type": storage.EggRollStorageType.ROLLPAIR_LMDB})
        elif output_storage_engine == StorageEngine.STANDALONE:
            address_dict.update({"name": output_table_name,
                                 "namespace": output_table_namespace,
                                 "storage_type": storage.StandaloneStorageType.ROLLPAIR_LMDB})
        elif output_storage_engine == StorageEngine.HDFS:
            address_dict.update({"path": data_utils.default_output_fs_path(
                name=output_table_name,
                namespace=output_table_namespace,
                prefix=address_dict.get("path_prefix"))})
        else:
            raise RuntimeError(f"{output_storage_engine} storage is not supported")
        address = storage.StorageTableMeta.create_address(storage_engine=output_storage_engine,
                                                          address_dict=address_dict)
        schema = {}
        # persistent table
        computing_table.save(address, schema=schema, partitions=partitions)
        part_of_data = []
        part_of_limit = 100
        for k, v in computing_table.collect():
            part_of_data.append((k, v))
            part_of_limit -= 1
            if part_of_limit == 0:
                break
        table_count = computing_table.count()
        table_meta = storage.StorageTableMeta(name=output_table_name,
                                              namespace=output_table_namespace,
                                              new=True)
        table_meta.address = address
        table_meta.partitions = computing_table.partitions
        table_meta.engine = output_storage_engine
        table_meta.type = storage.EggRollStorageType.ROLLPAIR_LMDB
        table_meta.schema = schema
        table_meta.part_of_data = part_of_data
        table_meta.count = table_count
        table_meta.create()
        return output_table_namespace, output_table_name
    else:
        schedule_logger(self.job_id).info('task id {} output data table is none'.format(self.task_id))
        return None, None
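# --- Illustrative sketch (not part of the module above) ---
# save_output_data keeps only the first 100 rows as "part_of_data", a preview
# stored alongside the table meta. The same pattern, shown on a plain iterable
# with itertools.islice (the counter/break loop above is equivalent):
from itertools import islice

def preview_rows(collected, limit=100):
    # collected is any (key, value) iterable, e.g. what computing_table.collect() yields
    return list(islice(collected, limit))

rows = ((f"id{i}", [i, i * 2]) for i in range(1000))
assert len(preview_rows(rows)) == 100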
def table_bind():
    request_data = request.json
    address_dict = request_data.get('address')
    engine = request_data.get('engine')
    name = request_data.get('name')
    namespace = request_data.get('namespace')
    address = storage.StorageTableMeta.create_address(storage_engine=engine, address_dict=address_dict)
    in_serialized = request_data.get("in_serialized",
                                     1 if engine in {storage.StorageEngine.STANDALONE,
                                                     storage.StorageEngine.EGGROLL,
                                                     storage.StorageEngine.MYSQL,
                                                     storage.StorageEngine.PATH} else 0)
    destroy = (int(request_data.get("drop", 0)) == 1)
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if data_table_meta:
        if destroy:
            data_table_meta.destroy_metas()
        else:
            return get_json_result(retcode=100,
                                   retmsg='The data table already exists. '
                                          'If you still want to continue uploading, please add the parameter --drop')
    id_column = request_data.get("id_column") or request_data.get("id_name")
    feature_column = request_data.get("feature_column") or request_data.get("feature_name")
    schema = None
    if id_column and feature_column:
        schema = {'header': feature_column, 'sid': id_column}
    elif id_column:
        schema = {'sid': id_column, 'header': ''}
    sess = Session()
    storage_session = sess.storage(storage_engine=engine, options=request_data.get("options"))
    table = storage_session.create_table(address=address, name=name, namespace=namespace,
                                         partitions=request_data.get('partitions', None),
                                         hava_head=request_data.get("head"),  # sic: keyword name as defined by create_table
                                         schema=schema,
                                         id_delimiter=request_data.get("id_delimiter"),
                                         in_serialized=in_serialized,
                                         origin=request_data.get("origin", StorageTableOrigin.TABLE_BIND))
    response = get_json_result(data={"table_name": name, "namespace": namespace})
    if not table.check_address():
        response = get_json_result(retcode=100,
                                   retmsg=f'engine {engine} address {address_dict} check failed')
    else:
        DataTableTracker.create_table_tracker(
            table_name=name,
            table_namespace=namespace,
            entity_info={"have_parent": False},
        )
    sess.destroy_all_sessions()
    return response
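# --- Illustrative sketch (not part of the module above) ---
# A sample table_bind request body, using only the fields the handler above
# reads. The address layout is engine-specific; the HDFS path here is a
# made-up placeholder, and the engine string assumes StorageEngine.HDFS == "HDFS".
example_bind_request = {
    "engine": "HDFS",                      # selects the storage session and address type
    "address": {"path": "/data/example"},  # hypothetical path, parsed by create_address
    "namespace": "experiment",
    "name": "breast_hetero_guest",
    "head": 1,                             # first line is a header
    "id_delimiter": ",",
    "partitions": 4,
    "id_column": "id",
    "feature_column": "x0,x1,x2",
    "drop": 0,                             # 1 would destroy existing metas first
}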
def get_table_info(name, namespace):
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    address = data_table_meta.get_address()
    schema = data_table_meta.get_schema()
    return address, schema