def detect_expired_session(cls):
    ttl = SESSION_VALID_PERIOD
    detect_logger().info(f'start detect expired session by ttl {ttl/1000} s')
    try:
        session_records = Session.query_sessions(create_time=[None, current_timestamp() - ttl])
        manager_session_id_list = []
        for session_record in session_records:
            manager_session_id = session_record.f_manager_session_id
            # skip manager sessions that have already been handled
            if manager_session_id in manager_session_id_list:
                continue
            manager_session_id_list.append(manager_session_id)
            detect_logger().info(f'start destroy session {manager_session_id}')
            try:
                sess = Session(session_id=manager_session_id, options={"logger": detect_logger()})
                sess.destroy_all_sessions()
            except Exception:
                detect_logger().exception(f'stop session {manager_session_id} error')
            else:
                detect_logger().info(f'stop session {manager_session_id} successfully')
    except Exception:
        detect_logger().exception('detect expired session error')
    finally:
        detect_logger().info('finish detect expired session')
def table_delete():
    request_data = request.json
    table_name = request_data.get('table_name')
    namespace = request_data.get('namespace')
    data = None
    sess = Session()
    table = sess.get_table(name=table_name, namespace=namespace, ignore_disable=True)
    if table:
        table.destroy()
        data = {'table_name': table_name, 'namespace': namespace}
    sess.destroy_all_sessions()
    if data:
        return get_json_result(data=data)
    return get_json_result(retcode=101, retmsg='no table found')
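# Hedged usage sketch for the table_delete() handler above: the route path and
# service address are assumptions (they are registered elsewhere in the app),
# while the JSON fields `table_name` and `namespace` are exactly what the
# handler reads from request.json.
import requests

def delete_table_via_http(base_url, table_name, namespace):
    # POST the two fields table_delete() expects in the request body
    resp = requests.post(
        f"{base_url}/table/delete",  # hypothetical route path
        json={"table_name": table_name, "namespace": namespace},
    )
    return resp.json()


# example: delete_table_via_http("http://127.0.0.1:9380/v1", "breast_a", "experiment")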
def run(cls):
    parser = argparse.ArgumentParser()
    parser.add_argument('--session_id', required=True, type=str, help="session id")
    parser.add_argument('--storage', help="storage engine", type=str)
    parser.add_argument('--file', required=True, type=str, help="file path")
    parser.add_argument('--namespace', required=True, type=str, help="namespace")
    parser.add_argument('--name', required=True, type=str, help="name")
    parser.add_argument('--partitions', required=True, type=int, help="partitions")
    args = parser.parse_args()
    session_id = args.session_id
    with Session(session_id=session_id) as sess:
        storage_session = sess.storage(storage_engine=args.storage)
        if args.storage in {StorageEngine.EGGROLL, StorageEngine.STANDALONE}:
            upload_address = {
                "name": args.name,
                "namespace": args.namespace,
                "storage_type": EggRollStoreType.ROLLPAIR_LMDB,
            }
        address = storage.StorageTableMeta.create_address(
            storage_engine=args.storage, address_dict=upload_address)
        table = storage_session.create_table(
            address=address,
            name=args.name,
            namespace=args.namespace,
            partitions=args.partitions,
            origin=StorageTableOrigin.UPLOAD)
        cls.upload(args.file, False, table=table)
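# Hedged usage sketch: one way the CLI entry point above could be invoked. The
# wrapper script name "upload_cli.py" is hypothetical; the flags map one-to-one
# onto the argparse arguments defined in run().
import subprocess

subprocess.run(
    [
        "python", "upload_cli.py",          # hypothetical script that calls run()
        "--session_id", "upload_session_1",
        "--storage", "STANDALONE",
        "--file", "/data/breast_a.csv",
        "--namespace", "experiment",
        "--name", "breast_a",
        "--partitions", "4",
    ],
    check=True,
)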
def _run(self, cpn_input: ComponentInputProtocol):
    self.parameters = cpn_input.parameters
    self.parameters["role"] = cpn_input.roles["role"]
    self.parameters["local"] = cpn_input.roles["local"]
    name, namespace = self.parameters.get("name"), self.parameters.get("namespace")
    with open(os.path.abspath(self.parameters["output_path"]), "w") as fw:
        session = Session(
            job_utils.generate_session_id(
                self.tracker.task_id,
                self.tracker.task_version,
                self.tracker.role,
                self.tracker.party_id,
            ))
        data_table = session.get_table(name=name, namespace=namespace)
        if not data_table:
            raise Exception(f"table {name} {namespace} not found")
        count = data_table.count()
        LOGGER.info("===== begin to export data =====")
        lines = 0
        job_info = {}
        job_info["job_id"] = self.tracker.job_id
        job_info["role"] = self.tracker.role
        job_info["party_id"] = self.tracker.party_id
        for key, value in data_table.collect():
            if not value:
                fw.write(key + "\n")
            else:
                fw.write(key + self.parameters.get("delimiter", ",") + str(value) + "\n")
            lines += 1
            if lines % 2000 == 0:
                LOGGER.info("===== export {} lines =====".format(lines))
            if lines % 10000 == 0:
                job_info["progress"] = lines / count * 100 // 1
                ControllerClient.update_job(job_info=job_info)
        job_info["progress"] = 100
        ControllerClient.update_job(job_info=job_info)
        self.callback_metric(
            metric_name="data_access",
            metric_namespace="download",
            metric_data=[Metric("count", data_table.count())],
        )
        LOGGER.info("===== export {} lines totally =====".format(lines))
        LOGGER.info("===== export data finish =====")
        LOGGER.info("===== export data file path:{} =====".format(
            os.path.abspath(self.parameters["output_path"])))
def table_delete_disable():
    request_data = request.json
    adapter_request_data(request_data)
    tables_meta = storage.StorageTableMeta.query_table_meta(filter_fields={"disable": True})
    data = []
    sess = Session()
    for table_meta in tables_meta:
        table = sess.get_table(name=table_meta.f_name,
                               namespace=table_meta.f_namespace,
                               ignore_disable=True)
        if table:
            table.destroy()
            data.append({
                'table_name': table_meta.f_name,
                'namespace': table_meta.f_namespace
            })
    sess.destroy_all_sessions()
    if data:
        return get_json_result(data=data)
    return get_json_result(retcode=101, retmsg='no table found')
def setUp(self):
    self.job_id = str(uuid.uuid1())
    self.session = Session.create(0, 0).init_computing(self.job_id).computing
    model = HeteroStepwise()
    model.__setattr__('role', consts.GUEST)
    model.__setattr__('fit_intercept', True)
    self.model = model
    data_num = 100
    feature_num = 5
    bool_list = [True, False, True, True, False]
    self.str_mask = "10110"
    self.header = ["x1", "x2", "x3", "x4", "x5"]
    self.mask = self.prepare_mask(bool_list)
    self.table = self.prepare_data(data_num, feature_num, self.header, "id", "y")
def setUp(self):
    self.job_id = str(uuid.uuid1())
    self.session = Session.create(0, 0).init_computing(self.job_id).computing
    self.data_splitter = data_split.DataSplitter()
    param_dict = {
        "random_state": 42,
        "test_size": 0.2,
        "train_size": 0.6,
        "validate_size": 0.2,
        "stratified": True,
        "shuffle": True,
        "split_points": [0.5, 0.2]
    }
    params = DataSplitParam(**param_dict)
    self.data_splitter._init_model(params)
def setUp(self):
    self.job_id = str(uuid.uuid1())
    self.session = Session.create(0, 0).init_computing(self.job_id).computing
    data_num = 100
    feature_num = 8
    self.prepare_data(data_num, feature_num)
    params = LocalBaselineParam()
    local_baseline_obj = LocalBaseline()
    local_baseline_obj._init_model(params)
    local_baseline_obj.need_run = True
    local_baseline_obj.header = ["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8"]
    local_baseline_obj.model_name = "LogisticRegression"
    local_baseline_obj.model_opts = {}
    self.local_baseline_obj = local_baseline_obj
def delete_tables_by_table_infos(output_data_table_infos):
    data = []
    status = False
    with Session() as sess:
        for output_data_table_info in output_data_table_infos:
            table_name = output_data_table_info.f_table_name
            namespace = output_data_table_info.f_table_namespace
            table_info = {'table_name': table_name, 'namespace': namespace}
            if table_name and namespace and table_info not in data:
                table = sess.get_table(table_name, namespace)
                if table:
                    try:
                        table.destroy()
                        data.append(table_info)
                        status = True
                    except:
                        pass
    return status, data
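# Hedged usage sketch for delete_tables_by_table_infos(): the helper only reads
# f_table_name / f_table_namespace from each record, so a namedtuple stand-in
# (an assumption, not the real ORM model) is enough to illustrate the call.
from collections import namedtuple

OutputDataInfo = namedtuple("OutputDataInfo", ["f_table_name", "f_table_namespace"])

infos = [
    OutputDataInfo("output_data_0", "output_ns_0"),
    OutputDataInfo("output_data_1", "output_ns_1"),
]
status, deleted = delete_tables_by_table_infos(infos)
print(f"any table deleted: {status}, details: {deleted}")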
def apply_func(func, job_id, role, num_hosts, ind, *args):
    partyid_map = dict(host=[9999 + i for i in range(num_hosts)], guest=[9999], arbiter=[9999])
    partyid = 9999
    if role == consts.HOST:
        partyid = 9999 + ind
    with Session() as session:
        session.init_computing(job_id, computing_type=ComputingType.STANDALONE)
        session.init_federation(
            federation_session_id=job_id,
            runtime_conf={
                "local": {
                    "role": role,
                    "party_id": partyid
                },
                "role": partyid_map
            })
        return func(job_id, role, ind, *args)
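# Hedged usage sketch: driving apply_func() above for one guest and one host in
# separate processes, the way a standalone session/federation setup is typically
# exercised in tests. job_func is a placeholder test body; apply_func and consts
# come from the snippet above.
import uuid
from multiprocessing import Pool

def job_func(job_id, role, ind):
    # placeholder body executed inside the initialized session
    return f"{role}[{ind}] finished job {job_id}"

if __name__ == "__main__":
    job_id = str(uuid.uuid1())
    num_hosts = 1
    with Pool(num_hosts + 1) as pool:
        results = [pool.apply_async(apply_func, (job_func, job_id, consts.GUEST, num_hosts, 0))]
        results += [pool.apply_async(apply_func, (job_func, job_id, consts.HOST, num_hosts, i))
                    for i in range(num_hosts)]
        print([r.get() for r in results])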
def run(cls):
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
    parser.add_argument('--computing', help="computing engine", type=str)
    parser.add_argument('--federation', help="federation engine", type=str)
    parser.add_argument('--storage', help="storage engine", type=str)
    parser.add_argument('-c', '--command', required=True, type=str, help="command")
    args = parser.parse_args()
    session_job_id = args.job_id
    fate_job_id = session_job_id.split('_')[0]
    command = args.command
    with Session(computing_type=args.computing, federation_type=args.federation) as session:
        session.init_computing(computing_session_id=session_job_id)
        try:
            schedule_logger(fate_job_id).info(
                'start {} session {}'.format(command, session.computing.session_id))
            if command == 'stop':
                session.computing.stop()
            elif command == 'kill':
                session.computing.kill()
            else:
                schedule_logger(fate_job_id).info(
                    '{} session {} failed, this command is not supported'.format(
                        command, session.computing.session_id))
            schedule_logger(fate_job_id).info(
                '{} session {} success'.format(command, session.computing.session_id))
        except Exception as e:
            schedule_logger().exception(e)
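# Hedged usage sketch: example invocations of the session-management entry point
# above. The script name is hypothetical; the flags match the argparse
# definitions in run(), and 'stop' / 'kill' are the two supported commands.
#
#   python session_manager.py -j 202201010000000000000_0 \
#       --computing EGGROLL --federation EGGROLL --storage EGGROLL -c stop
#
#   python session_manager.py -j 202201010000000000000_0 \
#       --computing STANDALONE -c kill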
def _call(cls, job_id, role, transfer_variable, num_hosts, ind, *args):
    role_id = {
        "host": [10000 + i for i in range(num_hosts)],
        "guest": [9999],
        "arbiter": [9999]
    }
    with Session() as session:
        session.init_computing(job_id, computing_type=ComputingType.STANDALONE)
        session.init_federation(
            job_id,
            runtime_conf={
                "local": {
                    "role": role,
                    "party_id": role_id[role][0] if role != "host" else role_id[role][ind]
                },
                "role": role_id
            })
        return cls.call(role, transfer_variable, ind, *args)
def table_bind():
    request_data = request.json
    address_dict = request_data.get('address')
    engine = request_data.get('engine')
    name = request_data.get('name')
    namespace = request_data.get('namespace')
    address = storage.StorageTableMeta.create_address(
        storage_engine=engine, address_dict=address_dict)
    in_serialized = request_data.get(
        "in_serialized",
        1 if engine in {
            storage.StorageEngine.STANDALONE, storage.StorageEngine.EGGROLL,
            storage.StorageEngine.MYSQL, storage.StorageEngine.PATH
        } else 0)
    destroy = (int(request_data.get("drop", 0)) == 1)
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if data_table_meta:
        if destroy:
            data_table_meta.destroy_metas()
        else:
            return get_json_result(
                retcode=100,
                retmsg='The data table already exists. '
                       'If you still want to continue uploading, please add the parameter --drop')
    id_column = request_data.get("id_column") or request_data.get("id_name")
    feature_column = request_data.get("feature_column") or request_data.get("feature_name")
    schema = None
    if id_column and feature_column:
        schema = {'header': feature_column, 'sid': id_column}
    elif id_column:
        schema = {'sid': id_column, 'header': ''}
    sess = Session()
    storage_session = sess.storage(storage_engine=engine, options=request_data.get("options"))
    table = storage_session.create_table(
        address=address,
        name=name,
        namespace=namespace,
        partitions=request_data.get('partitions', None),
        hava_head=request_data.get("head"),
        schema=schema,
        id_delimiter=request_data.get("id_delimiter"),
        in_serialized=in_serialized,
        origin=request_data.get("origin", StorageTableOrigin.TABLE_BIND))
    response = get_json_result(data={"table_name": name, "namespace": namespace})
    if not table.check_address():
        response = get_json_result(
            retcode=100,
            retmsg=f'engine {engine} address {address_dict} check failed')
    else:
        DataTableTracker.create_table_tracker(
            table_name=name,
            table_namespace=namespace,
            entity_info={"have_parent": False},
        )
    sess.destroy_all_sessions()
    return response
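# Hedged usage sketch: a request body that table_bind() above would accept for a
# MySQL-backed table. The URL and the exact MySQL address keys are assumptions;
# every top-level field below is read somewhere in the handler.
import requests

bind_payload = {
    "engine": "MYSQL",
    "name": "breast_a",
    "namespace": "experiment",
    "address": {                      # assumed MySQL address fields
        "user": "fate",
        "passwd": "fate",
        "host": "127.0.0.1",
        "port": 3306,
        "db": "experiment",
        "name": "breast_a",
    },
    "head": 1,
    "id_delimiter": ",",
    "partitions": 4,
    "id_column": "id",
    "feature_column": "x0,x1,x2",
    "drop": 0,
}
resp = requests.post("http://127.0.0.1:9380/v1/table/bind", json=bind_payload)  # hypothetical URL
print(resp.json())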
def _run(self, cpn_input: ComponentInputProtocol):
    self.parameters = cpn_input.parameters
    if self.parameters.get("namespace") and self.parameters.get("name"):
        namespace = self.parameters.get("namespace")
        name = self.parameters.get("name")
    elif cpn_input.flow_feeded_parameters.get("table_info"):
        namespace = cpn_input.flow_feeded_parameters.get("table_info")[0].get("namespace")
        name = cpn_input.flow_feeded_parameters.get("table_info")[0].get("name")
    else:
        raise Exception("name or namespace not found in input parameters")
    LOGGER.info(f"writer parameters:{self.parameters}")
    src_table = Session.get_global().get_table(name=name, namespace=namespace)
    output_name = self.parameters.get("output_name")
    output_namespace = self.parameters.get("output_namespace")
    engine = self.parameters.get("storage_engine")
    address_dict = self.parameters.get("address")
    if output_name and output_namespace:
        table_meta = src_table.meta.to_dict()
        address_dict = src_table.meta.get_address().__dict__
        engine = src_table.meta.get_engine()
        table_meta.update({
            "name": output_name,
            "namespace": output_namespace,
            "address": self._create_save_address(engine, address_dict, output_name, output_namespace),
        })
        src_table.save_as(**table_meta)
        # track the output table
        DataTableTracker.create_table_tracker(
            name,
            namespace,
            entity_info={
                "have_parent": True,
                "parent_table_namespace": namespace,
                "parent_table_name": name,
                "job_id": self.tracker.job_id,
            })
    elif engine and address_dict:
        save_data_to_external_storage(engine, address_dict, src_table)
    LOGGER.info("save success")
    self.tracker.log_output_data_info(
        data_name="writer",
        table_namespace=output_namespace,
        table_name=output_name,
    )
    self.tracker.log_metric_data(
        metric_namespace="writer",
        metric_name="writer",
        metrics=[Metric("count", src_table.meta.get_count()),
                 Metric("storage_engine", engine)])
def send_table(output_tables_meta, tar_file_name, limit=-1, need_head=True):
    output_data_file_list = []
    output_data_meta_file_list = []
    output_tmp_dir = os.path.join(
        get_fate_flow_directory(),
        'tmp/{}/{}'.format(datetime.datetime.now().strftime("%Y%m%d"), fate_uuid()))
    for output_name, output_table_meta in output_tables_meta.items():
        output_data_count = 0
        output_data_file_path = "{}/{}.csv".format(output_tmp_dir, output_name)
        output_data_meta_file_path = "{}/{}.meta".format(output_tmp_dir, output_name)
        os.makedirs(os.path.dirname(output_data_file_path), exist_ok=True)
        with open(output_data_file_path, 'w') as fw:
            with Session() as sess:
                output_table = sess.get_table(
                    name=output_table_meta.get_name(),
                    namespace=output_table_meta.get_namespace())
                if output_table:
                    for k, v in output_table.collect():
                        data_line, is_str, extend_header = feature_utils.get_component_output_data_line(
                            src_key=k, src_value=v, schema=output_table_meta.get_schema())
                        # save meta
                        if output_data_count == 0:
                            output_data_file_list.append(output_data_file_path)
                            header = get_component_output_data_schema(
                                output_table_meta=output_table_meta,
                                is_str=is_str,
                                extend_header=extend_header)
                            output_data_meta_file_list.append(output_data_meta_file_path)
                            with open(output_data_meta_file_path, 'w') as f:
                                json.dump({'header': header}, f, indent=4)
                            if need_head and header and output_table_meta.get_have_head():
                                fw.write('{}\n'.format(','.join(header)))
                        fw.write('{}\n'.format(','.join(map(lambda x: str(x), data_line))))
                        output_data_count += 1
                        if output_data_count == limit:
                            break
    # tar
    output_data_tarfile = "{}/{}".format(output_tmp_dir, tar_file_name)
    tar = tarfile.open(output_data_tarfile, mode='w:gz')
    for index in range(0, len(output_data_file_list)):
        tar.add(output_data_file_list[index],
                os.path.relpath(output_data_file_list[index], output_tmp_dir))
        tar.add(output_data_meta_file_list[index],
                os.path.relpath(output_data_meta_file_list[index], output_tmp_dir))
    tar.close()
    for key, path in enumerate(output_data_file_list):
        try:
            os.remove(path)
            os.remove(output_data_meta_file_list[key])
        except Exception as e:
            # warning
            stat_logger.warning(e)
    return send_file(output_data_tarfile, attachment_filename=tar_file_name)
                        type=str, help="host party id", default='10000')
    parser.add_argument('-j', '--job_id', required=True, type=str, help="job_id")
    args = parser.parse_args()
    job_id = args.job_id
    guest_id = args.gid
    host_id = args.hid
    role = args.role
    with Session.create(0, 0) as session:
        session.init_computing(job_id)
        session.init_federation(
            federation_session_id=job_id,
            parties_info=PartiesInfo(
                local=Party(role, guest_id if role == GUEST else host_id),
                role_to_parties={
                    "host": [Party("host", host_id)],
                    "guest": [Party("guest", guest_id)]
                }))
        test_obj = TestHeteroFeatureBinning(role, guest_id, host_id)
        # homo_obj.test_homo_lr()
        test_obj.test_feature_binning()
        test_obj.tearDown()
def _run(self, cpn_input: ComponentInputProtocol):
    self.parameters = cpn_input.parameters
    self.job_parameters = cpn_input.job_parameters
    output_storage_address = self.job_parameters.engines_address[EngineType.STORAGE]
    # only support one input table
    table_key = [key for key in self.parameters.keys()][0]
    input_table_namespace, input_table_name = self.get_input_table_info(
        parameters=self.parameters[table_key],
        role=self.tracker.role,
        party_id=self.tracker.party_id,
    )
    (
        output_table_namespace,
        output_table_name,
    ) = default_output_info(
        task_id=self.tracker.task_id,
        task_version=self.tracker.task_version,
        output_type="data",
    )
    (
        input_table_meta,
        output_table_address,
        output_table_engine,
    ) = self.convert_check(
        input_name=input_table_name,
        input_namespace=input_table_namespace,
        output_name=output_table_name,
        output_namespace=output_table_namespace,
        computing_engine=self.job_parameters.computing_engine,
        output_storage_address=output_storage_address,
    )
    sess = Session.get_global()
    input_table = sess.get_table(
        name=input_table_meta.get_name(),
        namespace=input_table_meta.get_namespace())
    # update real count to meta info
    input_table.count()
    # table replication is required when the storage engine changes
    if input_table_meta.get_engine() != output_table_engine:
        LOGGER.info(
            f"the {input_table_meta.get_engine()} engine input table needs to be converted "
            f"to {output_table_engine} engine to support computing engine "
            f"{self.job_parameters.computing_engine}")
    else:
        LOGGER.info(
            f"the {input_table_meta.get_engine()} input table needs a format transformation")
    LOGGER.info("reader create storage session2")
    output_table_session = sess.storage(storage_engine=output_table_engine)
    output_table = output_table_session.create_table(
        address=output_table_address,
        name=output_table_name,
        namespace=output_table_namespace,
        partitions=input_table_meta.partitions,
        origin=StorageTableOrigin.READER)
    self.save_table(src_table=input_table, dest_table=output_table)
    # update real count to meta info
    output_table_meta = StorageTableMeta(name=output_table.name, namespace=output_table.namespace)
    # todo: may be set output data, and executor support pass persistent
    self.tracker.log_output_data_info(
        data_name=cpn_input.flow_feeded_parameters.get("output_data_name")[0]
        if cpn_input.flow_feeded_parameters.get("output_data_name") else table_key,
        table_namespace=output_table_meta.get_namespace(),
        table_name=output_table_meta.get_name(),
    )
    DataTableTracker.create_table_tracker(
        output_table_meta.get_name(),
        output_table_meta.get_namespace(),
        entity_info={
            "have_parent": True,
            "parent_table_namespace": input_table_namespace,
            "parent_table_name": input_table_name,
            "job_id": self.tracker.job_id,
        },
    )
    headers_str = output_table_meta.get_schema().get("header")
    table_info = {}
    if output_table_meta.get_schema() and headers_str:
        if isinstance(headers_str, str):
            data_list = [headers_str.split(",")]
            is_display = True
        else:
            data_list = [headers_str]
            is_display = False
        if is_display:
            for data in output_table_meta.get_part_of_data():
                data_list.append(data[1].split(","))
            data = np.array(data_list)
            Tdata = data.transpose()
            for data in Tdata:
                table_info[data[0]] = ",".join(list(set(data[1:]))[:5])
    data_info = {
        "table_name": input_table_name,
        "namespace": input_table_namespace,
        "table_info": table_info,
        "partitions": output_table_meta.get_partitions(),
        "storage_engine": output_table_meta.get_engine(),
    }
    if input_table_meta.get_engine() in [StorageEngine.PATH]:
        data_info["file_count"] = output_table_meta.get_count()
        data_info["file_path"] = input_table_meta.get_address().path
    else:
        data_info["count"] = output_table_meta.get_count()
    self.tracker.set_metric_meta(
        metric_namespace="reader_namespace",
        metric_name="reader_name",
        metric_meta=MetricMeta(name="reader",
                               metric_type="data_info",
                               extra_metas=data_info),
    )
        result_data = selection_guest.save_data()
        local_data = result_data.collect()
        print("data in transform")
        for k, v in local_data:
            print("k: {}, v: {}".format(k, v.features))

    def tearDown(self):
        self.table.destroy()


if __name__ == '__main__':
    import sys
    job_id = str(sys.argv[1])
    with Session() as session:
        session.init_computing(job_id, computing_type=ComputingType.STANDALONE)
        session.init_federation(
            job_id,
            runtime_conf={
                "local": {
                    "role": "guest",
                    "party_id": 9999
                },
                "role": {
                    "host": [10000],
                    "guest": [9999]
                }
            })
        selection_obj = TestHeteroFeatureSelection()
        selection_obj.test_feature_selection()
def _run(self, cpn_input: ComponentInputProtocol):
    self.parameters = cpn_input.parameters
    LOGGER.info(self.parameters)
    self.parameters["role"] = cpn_input.roles["role"]
    self.parameters["local"] = cpn_input.roles["local"]
    storage_engine = self.parameters["storage_engine"].upper()
    storage_address = self.parameters["storage_address"]
    # if storage is not set, use the job's storage engine and address as default
    if not storage_engine:
        storage_engine = cpn_input.job_parameters.storage_engine
    self.storage_engine = storage_engine
    if not storage_address:
        storage_address = cpn_input.job_parameters.engines_address[EngineType.STORAGE]
    job_id = self.task_version_id.split("_")[0]
    if not os.path.isabs(self.parameters.get("file", "")):
        self.parameters["file"] = os.path.join(get_fate_flow_directory(), self.parameters["file"])
    if not os.path.exists(self.parameters["file"]):
        raise Exception("%s does not exist, please check the configuration" % (self.parameters["file"]))
    if not os.path.getsize(self.parameters["file"]):
        raise Exception("%s is an empty file" % (self.parameters["file"]))
    name, namespace = self.parameters.get("name"), self.parameters.get("namespace")
    _namespace, _table_name = self.generate_table_name(self.parameters["file"])
    if namespace is None:
        namespace = _namespace
    if name is None:
        name = _table_name
    read_head = self.parameters["head"]
    if read_head == 0:
        head = False
    elif read_head == 1:
        head = True
    else:
        raise Exception("'head' in conf.json should be 0 or 1")
    partitions = self.parameters["partition"]
    if partitions <= 0 or partitions >= self.MAX_PARTITIONS:
        raise Exception(
            "Invalid number of partitions, it should be between %d and %d" % (0, self.MAX_PARTITIONS))
    self.session_id = job_utils.generate_session_id(
        self.tracker.task_id,
        self.tracker.task_version,
        self.tracker.role,
        self.tracker.party_id,
    )
    sess = Session.get_global()
    self.session = sess
    if self.parameters.get("destroy", False):
        table = sess.get_table(namespace=namespace, name=name)
        if table:
            LOGGER.info(
                f"destroy table name: {name} namespace: {namespace} engine: {table.engine}")
            try:
                table.destroy()
            except Exception as e:
                LOGGER.error(e)
        else:
            LOGGER.info(
                f"cannot find table name: {name} namespace: {namespace}, skip destroy")
    address_dict = storage_address.copy()
    storage_session = sess.storage(storage_engine=storage_engine,
                                   options=self.parameters.get("options"))
    upload_address = {}
    if storage_engine in {StorageEngine.EGGROLL, StorageEngine.STANDALONE}:
        upload_address = {
            "name": name,
            "namespace": namespace,
            "storage_type": EggRollStoreType.ROLLPAIR_LMDB,
        }
    elif storage_engine in {StorageEngine.MYSQL, StorageEngine.HIVE}:
        if not address_dict.get("db") or not address_dict.get("name"):
            upload_address = {"db": namespace, "name": name}
    elif storage_engine in {StorageEngine.PATH}:
        upload_address = {"path": self.parameters["file"]}
    elif storage_engine in {StorageEngine.HDFS}:
        upload_address = {
            "path": default_input_fs_path(
                name=name,
                namespace=namespace,
                prefix=address_dict.get("path_prefix"),
            )
        }
    elif storage_engine in {StorageEngine.LOCALFS}:
        upload_address = {
            "path": default_input_fs_path(name=name,
                                          namespace=namespace,
                                          storage_engine=storage_engine)
        }
    else:
        raise RuntimeError(f"can not support this storage engine: {storage_engine}")
    address_dict.update(upload_address)
    LOGGER.info(f"upload to {storage_engine} storage, address: {address_dict}")
    address = storage.StorageTableMeta.create_address(
        storage_engine=storage_engine, address_dict=address_dict)
    self.parameters["partitions"] = partitions
    self.parameters["name"] = name
    self.table = storage_session.create_table(
        address=address, origin=StorageTableOrigin.UPLOAD, **self.parameters)
    if storage_engine not in [StorageEngine.PATH]:
        data_table_count = self.save_data_table(job_id, name, namespace, storage_engine, head)
    else:
        data_table_count = self.get_data_table_count(self.parameters["file"], name, namespace)
    self.table.meta.update_metas(in_serialized=True)
    DataTableTracker.create_table_tracker(
        table_name=name,
        table_namespace=namespace,
        entity_info={"job_id": job_id, "have_parent": False},
    )
    LOGGER.info("------------load data finish!-----------------")
    # remove the temporary upload file
    try:
        if "{}/fate_upload_tmp".format(job_id) in self.parameters["file"]:
            LOGGER.info("remove tmp upload file")
            LOGGER.info(os.path.dirname(self.parameters["file"]))
            shutil.rmtree(os.path.dirname(self.parameters["file"]))
    except:
        LOGGER.info("remove tmp file failed")
    LOGGER.info("file: {}".format(self.parameters["file"]))
    LOGGER.info("total data_count: {}".format(data_table_count))
    LOGGER.info("table name: {}, table namespace: {}".format(name, namespace))