def run_job(self, job_id, config):
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
    _job_dir = get_job_directory(job_id=job_id)
    os.makedirs(_job_dir, exist_ok=True)
    ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
    logger.info('job_id {} parameters overrode {}'.format(job_id, _job_dir))
    channel, stub = get_proxy_data_channel()
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _method = 'POST'
        _module = runtime_conf['module']
        _url = '/workflow/{}/{}/{}'.format(job_id, _module, _role)
        _packet = wrap_grpc_packet(runtime_conf, _method, _url, _party_id, job_id)
        logger.info(
            'Starting workflow job_id:{} party_id:{} role:{} method:{} url:{}'.format(
                job_id, _party_id, _role, _method, _url))
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to start workflow'.format(
                job_id, _party_id, _role, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall submit to remote manager failed')

def init():
    if EggRoll.init_flag:
        return
    config = file_utils.load_json_conf('arch/conf/mock_roll.json')
    egg_ids = config.get('eggs')
    for egg_id in egg_ids:
        target = config.get('storage').get(egg_id)
        channel = grpc.insecure_channel(
            target,
            options=[('grpc.max_send_message_length', -1),
                     ('grpc.max_receive_message_length', -1)])
        EggRoll.egg_list.append(kv_pb2_grpc.KVServiceStub(channel))
        procs = config.get('procs').get(egg_id)
        for proc in procs:
            _channel = grpc.insecure_channel(
                proc,
                options=[('grpc.max_send_message_length', -1),
                         ('grpc.max_receive_message_length', -1)])
            _stub = processor_pb2_grpc.ProcessServiceStub(_channel)
            proc_info = (_channel, _stub)
            i = len(EggRoll.proc_list)
            EggRoll.proc_egg_map[i] = int(egg_id) - 1
            EggRoll.proc_list.append(proc_info)
    EggRoll.init_flag = True

def init_roll_site_context(runtime_conf, session_id):
    from eggroll.roll_site.roll_site import RollSiteContext
    from eggroll.roll_pair.roll_pair import RollPairContext
    LOGGER.info("init_roll_site_context runtime_conf: {}".format(runtime_conf))
    session_instance = FateSession.get_instance()._eggroll.get_session()
    rp_context = RollPairContext(session_instance)

    role = runtime_conf.get("local").get("role")
    party_id = str(runtime_conf.get("local").get("party_id"))
    _path = file_utils.get_project_base_directory() + "/arch/conf/server_conf.json"
    server_conf = file_utils.load_json_conf(_path)
    host = server_conf.get('servers').get('proxy').get("host")
    port = server_conf.get('servers').get('proxy').get("port")

    options = {
        'self_role': role,
        'self_party_id': party_id,
        'proxy_endpoint': ErEndpoint(host, int(port))
    }

    rs_context = RollSiteContext(session_id, rp_ctx=rp_context, options=options)
    LOGGER.info("init_roll_site_context done: {}".format(rs_context.__dict__))
    return rp_context, rs_context

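# Illustrative only: a minimal server_conf.json fragment that init_roll_site_context
# reads the proxy endpoint from. The host and port values below are made-up
# placeholders for a sketch, not real deployment settings.
#
# {
#   "servers": {
#     "proxy": {"host": "127.0.0.1", "port": 9370}
#   }
# }
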
def call_back(status):
    global job_id
    global role
    global party_id
    global LOCAL_URL
    if job_id is None:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', type=str, required=True, help="Specify the jobid")
        parser.add_argument('-c', '--config', required=True, type=str, help="Specify a config json file path")
        args = parser.parse_args()
        job_id = args.job_id
        config = file_utils.load_json_conf(args.config)
        role = config.get('local', {}).get('role')
        party_id = config.get('local', {}).get('party_id')
    try:
        requests.post("/".join([LOCAL_URL, str(job_id), str(role), str(party_id)]),
                      json={"status": status})
    except Exception:
        LOGGER.info("failed to post status {}".format(status))

def build_federation(self, federation_id, runtime_conf, server_conf_path):
    if self._work_mode.is_standalone():
        from arch.api.impl.based_1x.federation_standalone import FederationRuntime
        return FederationRuntime(session_id=federation_id, runtime_conf=runtime_conf)
    elif self._work_mode.is_cluster():
        from arch.api.impl.based_1x.federation_cluster import FederationRuntime
        server_conf = file_utils.load_json_conf(server_conf_path)
        if CONF_KEY_SERVER not in server_conf:
            raise EnvironmentError(
                "server_conf should contain key {}".format(CONF_KEY_SERVER))
        if CONF_KEY_FEDERATION not in server_conf.get(CONF_KEY_SERVER):
            raise EnvironmentError(
                "The {} should be a json file containing key: {}".format(
                    server_conf_path, CONF_KEY_FEDERATION))
        host = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("host")
        port = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("port")
        return FederationRuntime(session_id=federation_id,
                                 runtime_conf=runtime_conf,
                                 host=host,
                                 port=port)

def __init__(self, job_id, party_id, role, runtime_conf):
    self.trans_conf = file_utils.load_json_conf('federatedml/transfer_variable_conf/transfer_conf.json')
    self.job_id = job_id
    self.party_id = party_id
    self.role = role
    self.runtime_conf = runtime_conf
    self._loop = asyncio.get_event_loop()
    FederationRuntime.instance = self

def init(job_id=None, server_conf_path="arch/conf/server_conf.json"):
    if job_id is None:
        job_id = str(uuid.uuid1())
    global LOGGER
    LOGGER = getLogger()
    server_conf = file_utils.load_json_conf(server_conf_path)
    _roll_host = server_conf.get("servers").get("roll").get("host")
    _roll_port = server_conf.get("servers").get("roll").get("port")
    _EggRoll(job_id, _roll_host, _roll_port)

def run_job(self, job_id, config):
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    setting_conf = file_utils.load_json_conf('workflow/conf/setting_conf.json')
    _job_dir = get_job_directory(job_id=job_id)
    os.makedirs(_job_dir, exist_ok=True)
    ParameterOverride.override_parameter(default_runtime_dict, setting_conf, config, _job_dir)
    logger.info('job_id {} parameters overrode {}'.format(job_id, _job_dir))
    run_job_success = True
    job_param = dict()
    job_param['job_id'] = job_id
    job_param['initiator'] = PARTY_ID
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'),
                                       recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        runtime_conf['JobParam'] = job_param
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _module = runtime_conf['module']
        st, msg = federated_api(job_id=job_id,
                                method='POST',
                                url='/workflow/{}/{}/{}'.format(job_id, _module, _role),
                                party_id=_party_id,
                                json_body=runtime_conf)
        if st == 0:
            save_job_info(job_id=job_id,
                          role=_role,
                          party_id=_party_id,
                          save_info={"status": "ready", "initiator": PARTY_ID},
                          create=True)
        else:
            run_job_success = False
    logger.info("run job done")
    return run_job_success

def query_model_version_history():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        eggroll.init(mode=WORK_MODE)
        history = version_history(data_table_namespace=config.get("namespace"))
        return get_json_result(msg=json.dumps(history))
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="query model version history error: %s" % e)

def publish_model_online():
    request_data = request.json
    try:
        config = file_utils.load_json_conf(request_data.get("config_path"))
        if not config.get('servings'):
            # default to all serving servers of this party
            config['servings'] = SERVINGS
        publish_model.publish_online(config_data=config)
        return get_json_result()
    except Exception as e:
        logger.exception(e)
        return get_json_result(status=1, msg="publish model error: %s" % e)

def __init__(self, job_id, party_id, role, runtime_conf, host, port):
    self.trans_conf = file_utils.load_json_conf('federatedml/transfer_variable_conf/transfer_conf.json')
    self.job_id = job_id
    self.party_id = party_id
    self.role = role
    self.runtime_conf = runtime_conf
    self.channel = grpc.insecure_channel(
        target="{}:{}".format(host, port),
        options=[('grpc.max_send_message_length', -1),
                 ('grpc.max_receive_message_length', -1)])
    self.stub = federation_pb2_grpc.TransferSubmitServiceStub(self.channel)
    self.__pool = concurrent.futures.ThreadPoolExecutor()
    FederationRuntime.instance = self

def __init__(self, transfer_conf_path):
    self.transfer_auth = {}
    for path, _, file_names in os.walk(
            os.path.join(file_utils.get_project_base_directory(), transfer_conf_path)):
        for name in file_names:
            transfer_conf = os.path.join(path, name)
            if transfer_conf.endswith(".json"):
                self.transfer_auth.update(file_utils.load_json_conf(transfer_conf))
    # cache
    self._authorized_src = {}
    self._authorized_dst = {}

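# Illustrative only: transfer_auth ends up as the merge of every *.json file found under
# transfer_conf_path. A hypothetical entry would map a transfer-variable name to the roles
# allowed to send ("src") and receive ("dst") it, which is what the _authorized_src /
# _authorized_dst caches are later filled from; the exact schema is defined by the conf
# files themselves, not by this class.
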
def init(job_id, runtime_conf, server_conf_path):
    global LOGGER
    LOGGER = getLogger()
    server_conf = file_utils.load_json_conf(server_conf_path)
    if CONF_KEY_SERVER not in server_conf:
        raise EnvironmentError("server_conf should contain key {}".format(CONF_KEY_SERVER))
    if CONF_KEY_FEDERATION not in server_conf.get(CONF_KEY_SERVER):
        raise EnvironmentError(
            "The {} should be a json file containing key: {}".format(server_conf_path, CONF_KEY_FEDERATION))
    _host = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("host")
    _port = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("port")
    if CONF_KEY_LOCAL not in runtime_conf:
        raise EnvironmentError("runtime_conf should be a dict containing key: {}".format(CONF_KEY_LOCAL))
    _party_id = runtime_conf.get(CONF_KEY_LOCAL).get("party_id")
    _role = runtime_conf.get(CONF_KEY_LOCAL).get("role")
    return FederationRuntime(job_id, _party_id, _role, runtime_conf, _host, _port)

def start_workflow(job_id, module, role):
    _config = request.json
    _job_dir = get_job_directory(job_id)
    _party_id = str(_config['local']['party_id'])
    _method = _config['WorkFlowParam']['method']
    default_runtime_dict = file_utils.load_json_conf('workflow/conf/default_runtime_conf.json')
    fill_runtime_conf_table_info(runtime_conf=_config, default_runtime_conf=default_runtime_dict)
    conf_file_path = new_runtime_conf(job_dir=_job_dir, method=_method, module=module,
                                      role=role, party_id=_party_id)
    with open(conf_file_path, 'w+') as f:
        f.truncate()
        f.write(json.dumps(_config, indent=4))
        f.flush()
    progs = [
        "python3",
        os.path.join(file_utils.get_project_base_directory(), _config['CodePath']),
        "-j", job_id,
        "-c", os.path.abspath(conf_file_path)
    ]
    p = run_subprocess(job_dir=_job_dir, job_role=role, progs=progs)
    job_status = "start"
    job_data = dict()
    job_data["begin_date"] = datetime.datetime.now()
    job_data["status"] = job_status
    job_data.update(_config)
    job_data["pid"] = p.pid
    job_data["all_party"] = json.dumps(_config.get("role", {}))
    job_data["initiator"] = _config.get("JobParam", {}).get("initiator")
    save_job_info(job_id=job_id,
                  role=_config.get("local", {}).get("role"),
                  party_id=_config.get("local", {}).get("party_id"),
                  save_info=job_data,
                  create=True)
    update_job_queue(job_id=job_id,
                     role=role,
                     party_id=_party_id,
                     save_data={"status": job_status, "pid": p.pid})
    return get_json_result(data={'pid': p.pid}, job_id=job_id)

def init(job_id, runtime_conf, server_conf_path):
    server_conf = file_utils.load_json_conf(server_conf_path)
    if CONF_KEY_SERVER not in server_conf:
        raise EnvironmentError(
            "server_conf should contain key {}".format(CONF_KEY_SERVER))
    if CONF_KEY_FEDERATION not in server_conf.get(CONF_KEY_SERVER):
        raise EnvironmentError(
            "The {} should be a json file containing key: {}".format(
                server_conf_path, CONF_KEY_FEDERATION))
    _host = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("host")
    _port = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("port")
    federation_runtime = federation.init(job_id, runtime_conf, server_conf_path)
    return FateScript(federation_runtime, _host, _port)

def stop_job(job_id):
    _job_dir = get_job_directory(job_id)
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        _role = runtime_conf['local']['role']
        _party_id = runtime_conf['local']['party_id']
        _url = '/workflow/{}'.format(job_id)
        _method = 'DELETE'
        _packet = wrap_grpc_packet({}, _method, _url, _party_id, job_id)
        channel, stub = get_proxy_data_channel()
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} role:{} method:{} url:{} Failed to stop workflow'.format(
                job_id, _party_id, _role, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall stop to remote manager failed')
    return get_json_result()

def test_component(self, fun):
    job_id = os.listdir(os.path.abspath(os.path.join(self.success_job_dir)))[-1]
    job_info = file_utils.load_json_conf(
        os.path.abspath(os.path.join(self.success_job_dir, job_id)))
    data = {
        'job_id': job_id,
        'role': job_info['f_role'],
        'party_id': job_info['f_party_id'],
        'component_name': self.test_component_name
    }
    if 'download' in fun:
        response = requests.get("/".join([self.server_url, "tracking", fun]), json=data, stream=True)
        self.assertTrue(response.status_code in [200, 201])
    else:
        response = requests.post("/".join([self.server_url, 'tracking', fun]), json=data)
        self.assertTrue(response.status_code in [200, 201])
        self.assertTrue(int(response.json()['retcode']) == 0)

def stop_job(job_id):
    _job_dir = get_job_directory(job_id)
    all_party = []
    for runtime_conf_path in glob.glob(os.path.join(_job_dir, '**', 'runtime_conf.json'), recursive=True):
        runtime_conf = file_utils.load_json_conf(os.path.abspath(runtime_conf_path))
        for _role, _party_ids in runtime_conf['role'].items():
            all_party.extend([(_role, _party_id) for _party_id in _party_ids])
    all_party = set(all_party)
    logger.info('start send stop job to {}'.format(','.join([i[0] for i in all_party])))
    _method = 'DELETE'
    for _role, _party_id in all_party:
        federated_api(job_id=job_id,
                      method=_method,
                      url='/workflow/{}/{}/{}'.format(job_id, _role, _party_id),
                      party_id=_party_id)
    return get_json_result(job_id=job_id)

def load_model():
    config = file_utils.load_json_conf(request.json.get("config_path"))
    _job_id = generate_job_id()
    channel, stub = get_proxy_data_channel()
    for _party_id in config.get("party_ids"):
        config['my_party_id'] = _party_id
        _method = 'POST'
        _url = '/model/load/do'
        _packet = wrap_grpc_packet(config, _method, _url, _party_id, _job_id)
        logger.info(
            'Starting load model job_id:{} party_id:{} method:{} url:{}'.format(
                _job_id, _party_id, _method, _url))
        try:
            _return = stub.unaryCall(_packet)
            logger.info("Grpc unary response: {}".format(_return))
        except grpc.RpcError as e:
            msg = 'job_id:{} party_id:{} method:{} url:{} Failed to start load model'.format(
                _job_id, _party_id, _method, _url)
            logger.exception(msg)
            return get_json_result(-101, 'UnaryCall submit to remote manager failed')
    return get_json_result()

def init_federation(session_id, work_mode, runtime_conf, server_conf_path) -> Federation:
    if work_mode.is_standalone():
        from .standalone import FederationRuntime
        return FederationRuntime(session_id, runtime_conf)
    elif work_mode.is_cluster():
        from .cluster import FederationRuntime
        server_conf = file_utils.load_json_conf(server_conf_path)
        if CONF_KEY_SERVER not in server_conf:
            raise EnvironmentError(
                "server_conf should contain key {}".format(CONF_KEY_SERVER))
        if CONF_KEY_FEDERATION not in server_conf.get(CONF_KEY_SERVER):
            raise EnvironmentError(
                "The {} should be a json file containing key: {}".format(
                    server_conf_path, CONF_KEY_FEDERATION))
        host = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("host")
        port = server_conf.get(CONF_KEY_SERVER).get(CONF_KEY_FEDERATION).get("port")
        return FederationRuntime(session_id, runtime_conf, host, port)
    else:
        raise EnvironmentError(f"{work_mode} unknown")

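# Illustrative only: the shape of server_conf.json that the cluster branch above expects,
# assuming CONF_KEY_SERVER resolves to "servers" and CONF_KEY_FEDERATION to "federation"
# (the values below are placeholders, not real deployment settings).
#
# {
#   "servers": {
#     "federation": {"host": "127.0.0.1", "port": 9394}
#   }
# }
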
    'passwd': 'fate_dev',
    'host': 'mysql',
    'port': 3306,
    'max_connections': 100,
    'stale_timeout': 30,
}

REDIS = {
    'host': 'redis',
    'port': 6379,
    'password': '******',
    'max_connections': 500
}
REDIS_QUEUE_DB_INDEX = 0

JOB_MODULE_CONF = file_utils.load_json_conf("fate_flow/job_module_conf.json")

"""
Services
"""
server_conf = file_utils.load_json_conf("arch/conf/server_conf.json")
PROXY_HOST = server_conf.get(SERVERS).get('proxy').get('host')
PROXY_PORT = server_conf.get(SERVERS).get('proxy').get('port')
BOARD_HOST = server_conf.get(SERVERS).get('fateboard').get('host')
if BOARD_HOST == 'localhost':
    BOARD_HOST = get_lan_ip()
BOARD_PORT = server_conf.get(SERVERS).get('fateboard').get('port')
SERVINGS = server_conf.get(SERVERS).get('servings')
BOARD_DASHBOARD_URL = 'http://%s:%d/index.html#/dashboard?job_id={}&role={}&party_id={}' % (BOARD_HOST, BOARD_PORT)
RuntimeConfig.init_config(WORK_MODE=WORK_MODE)
RuntimeConfig.init_config(HTTP_PORT=HTTP_PORT)

def run_task():
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--processors_per_node', help="processors_per_node", type=int)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        executor_pid = os.getpid()
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        component_parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
        task_parameters = task_config['task_parameters']
        module_name = task_config['module_name']
        TaskExecutor.monkey_patch()
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           component_module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = executor_pid
        run_class_paths = component_parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                      role=role, party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      initiator_role=job_initiator.get('role', None),
                                      task_info=task.to_json())
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                  BACKEND=job_parameters.get('backend', 0))
        if args.processors_per_node and args.processors_per_node > 0 and RuntimeConfig.BACKEND == Backend.EGGROLL:
            session_options = {"eggroll.session.processors.per.node": args.processors_per_node}
        else:
            session_options = {}
        session.init(job_id=job_utils.generate_session_id(task_id, role, party_id),
                     mode=RuntimeConfig.WORK_MODE,
                     backend=RuntimeConfig.BACKEND,
                     options=session_options)
        federation.init(job_id=task_id, runtime_conf=component_parameters)
        schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info(component_parameters)
        schedule_logger().info(task_input_dsl)
        task_run_args = TaskExecutor.get_task_run_args(
            job_id=job_id, role=role, party_id=party_id, task_id=task_id,
            job_args=job_args,
            job_parameters=job_parameters,
            task_parameters=task_parameters,
            input_dsl=task_input_dsl,
            if_save_as_task_input_data=job_parameters.get("save_as_task_input_data", SAVE_AS_TASK_INPUT_DATA_SWITCH))
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        run_object.run(component_parameters, task_run_args)
        output_data = run_object.save_data()
        tracker.save_output_data_table(output_data,
                                       task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component')
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task.f_status = TaskStatus.COMPLETE
    except Exception as e:
        task.f_status = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        sync_success = False
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                          role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())
            sync_success = True
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
    schedule_logger().info('task {} {} {} start time: {}'.format(task_id, role, party_id,
                                                                 timestamp_to_date(task.f_start_time)))
    schedule_logger().info('task {} {} {} end time: {}'.format(task_id, role, party_id,
                                                               timestamp_to_date(task.f_end_time)))
    schedule_logger().info('task {} {} {} takes {}s'.format(task_id, role, party_id, int(task.f_elapsed) / 1000))
    schedule_logger().info(
        'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                               task.f_status if sync_success else TaskStatus.FAILED))
    print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                 task.f_status if sync_success else TaskStatus.FAILED))

audit_logger = log_utils.audit_logger()

"""
Services
"""
IP = get_base_config("fate_flow", {}).get("host", "0.0.0.0")
HTTP_PORT = get_base_config("fate_flow", {}).get("http_port")
GRPC_PORT = get_base_config("fate_flow", {}).get("grpc_port")
# Standalone jobs are sent to this standalone job server port when FATE-Flow runs in cluster deploy mode;
# it is not the port used by FATE-Flow in standalone deploy mode.
CLUSTER_STANDALONE_JOB_SERVER_PORT = 9381

# services ip and port
SERVER_CONF_PATH = 'arch/conf/server_conf.json'
SERVING_PATH = '/servers/servings'
server_conf = file_utils.load_json_conf(SERVER_CONF_PATH)
PROXY_HOST = server_conf.get(SERVERS).get('proxy').get('host')
PROXY_PORT = server_conf.get(SERVERS).get('proxy').get('port')
BOARD_HOST = server_conf.get(SERVERS).get('fateboard').get('host')
if BOARD_HOST == 'localhost':
    BOARD_HOST = get_lan_ip()
BOARD_PORT = server_conf.get(SERVERS).get('fateboard').get('port')
MANAGER_HOST = server_conf.get(SERVERS).get('fatemanager', {}).get('host')
MANAGER_PORT = server_conf.get(SERVERS).get('fatemanager', {}).get('port')
SERVINGS = CenterConfig.get_settings(path=SERVING_PATH, servings_zk_path=SERVINGS_ZK_PATH,
                                     use_zk=USE_CONFIGURATION_CENTER, hosts=ZOOKEEPER_HOSTS,
                                     server_conf_path=SERVER_CONF_PATH)
BOARD_DASHBOARD_URL = 'http://%s:%d/index.html#/dashboard?job_id={}&role={}&party_id={}' % (BOARD_HOST, BOARD_PORT)

def __init__(self, fed_obj):
    super().__init__(fed_obj.job_id, fed_obj.party_id, fed_obj.role, fed_obj.runtime_conf)
    self.trans_conf = file_utils.load_json_conf('contrib/fate_script/conf/FateScriptTransferVar.json')
    self.encrypt_operator = None

def init_conf(self, role):
    # load_json_conf returns the parsed conf dict, so name the variable accordingly
    conf = file_utils.load_json_conf('contrib/fate_script/conf/' + str(role) + '_runtime_conf.json')
    self.iter_num = conf.get("FATEScriptLRParam").get("iter_num")
    self.batch_num = conf.get("FATEScriptLRParam").get("batch_num")
    self.learning_rate = conf.get("FATEScriptLRParam").get("learning_rate")
    self.eps = conf.get("FATEScriptLRParam").get("eps")

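# Illustrative only: the runtime conf fragment init_conf reads. The key names come from the
# code above; the numeric values are made-up placeholders, not recommended settings.
#
# {
#   "FATEScriptLRParam": {
#     "iter_num": 10,
#     "batch_num": 10,
#     "learning_rate": 0.15,
#     "eps": 1e-4
#   }
# }
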
def get_settings_from_file(path, server_conf_path):
    server_conf = file_utils.load_json_conf(server_conf_path)
    data = server_conf
    for k in path.split('/')[1:]:
        data = data.get(k, None)
    return data

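# Illustrative only: with SERVING_PATH = '/servers/servings' (as defined elsewhere in the
# settings module above), the lookup walks server_conf["servers"]["servings"]. The conf
# path below is the project default; a real deployment may differ.
#
#   servings = get_settings_from_file('/servers/servings', 'arch/conf/server_conf.json')
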
def get_job_conf(job_id):
    conf_dict = {}
    for key, path in get_job_conf_path(job_id).items():
        config = file_utils.load_json_conf(path)
        conf_dict[key] = config
    return conf_dict

def run_task():
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger.info('enter task process')
        schedule_logger.info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        parameters = task_config['parameters']
        module_name = task_config['module_name']
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
        storage.init_storage(job_id=task_id, work_mode=RuntimeConfig.WORK_MODE)
        federation.init(job_id=task_id, runtime_conf=parameters)
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = os.getpid()
        run_class_paths = parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                       job_parameters=job_parameters, job_args=job_args,
                                                       input_dsl=task_input_dsl)
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                      role=role, party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      task_info=task.to_json())
        schedule_logger.info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger.info(parameters)
        schedule_logger.info(task_input_dsl)
        run_object.run(parameters, task_run_args)
        if task_output_dsl:
            if task_output_dsl.get('data', []):
                output_data = run_object.save_data()
                tracker.save_output_data_table(output_data, task_output_dsl.get('data')[0])
            if task_output_dsl.get('model', []):
                output_model = run_object.export_model()
                # There is only one model output at the current dsl version.
                tracker.save_output_model(output_model, task_output_dsl['model'][0])
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id,
                                          role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          task_info=task.to_json())
        except Exception as e:
            schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role,
                                                                    party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status))

import os
import json
import tarfile
import traceback
from contextlib import closing
import time
import re

import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

from arch.api.utils import file_utils
from arch.api.utils.core_utils import get_lan_ip
from fate_flow.settings import SERVERS, ROLE, API_VERSION
from fate_flow.utils import detect_utils

server_conf = file_utils.load_json_conf("arch/conf/server_conf.json")
JOB_OPERATE_FUNC = ["submit_job", "stop_job", "query_job", "data_view_query", "clean_job", "clean_queue"]
JOB_FUNC = ["job_config", "job_log"]
TASK_OPERATE_FUNC = ["query_task"]
TRACKING_FUNC = ["component_parameters", "component_metric_all", "component_metric_delete", "component_metrics",
                 "component_output_model", "component_output_data", "component_output_data_table"]
DATA_FUNC = ["download", "upload", "upload_history"]
TABLE_FUNC = ["table_info", "table_delete"]
MODEL_FUNC = ["load", "bind", "store", "restore", "export", "import"]
PERMISSION_FUNC = ["grant_privilege", "delete_privilege", "query_privilege"]


def prettify(response, verbose=True):
    if verbose:
        print(json.dumps(response, indent=4, ensure_ascii=False))
        print()

_ONE_DAY_IN_SECONDS = 60 * 60 * 24
DEFAULT_GRPC_OVERALL_TIMEOUT = 60 * 1000  # ms
HEADERS = {
    'Content-Type': 'application/json',
}

IP = '0.0.0.0'
GRPC_PORT = 9360
HTTP_PORT = 9380
PARTY_ID = 9999
WORK_MODE = 0
LOCAL_URL = "http://localhost:{}".format(HTTP_PORT)

DATABASE = {
    'engine': 'mysql',
    'name': 'task_manager',
    'user': '******',
    'passwd': 'root1234',
    'host': '127.0.0.1',
    'port': 3306,
    'max_connections': 500,
    'stale_timeout': 30,
}

server_conf = file_utils.load_json_conf("arch/conf/server_conf.json")
PROXY_HOST = server_conf.get(SERVERS).get('proxy').get('host')
PROXY_PORT = server_conf.get(SERVERS).get('proxy').get('port')
SERVINGS = server_conf.get(SERVERS).get('servings')
JOB_MODULE_CONF = file_utils.load_json_conf("arch/task_manager/job_module_conf.json")