def init_config(**kwargs):
    for k, v in kwargs.items():
        if hasattr(RuntimeConfig, k):
            setattr(RuntimeConfig, k, v)
            if k == 'HTTP_PORT':
                setattr(RuntimeConfig, 'JOB_SERVER_HOST',
                        "{}:{}".format(get_lan_ip(), RuntimeConfig.HTTP_PORT))
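# Usage sketch (values are illustrative, not taken from this module): run_task() below
# passes the port it parses from --job_server, e.g.
#     RuntimeConfig.init_config(HTTP_PORT='9380')
# which also derives JOB_SERVER_HOST as "<lan_ip>:9380" via get_lan_ip().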
def run_do(self):
    try:
        running_tasks = job_utils.query_task(status='running', run_ip=get_lan_ip())
        stop_job_ids = set()
        # detect_logger.info('start to detect running job..')
        for task in running_tasks:
            try:
                process_exist = job_utils.check_job_process(int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'.format(
                            task.f_job_id, task.f_component_name, task.f_role,
                            task.f_party_id, task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)
        if stop_job_ids:
            schedule_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        for job_id in stop_job_ids:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                initiator_party_id = jobs[0].f_initiator_party_id
                job_work_mode = jobs[0].f_work_mode
                if len(jobs) > 1:
                    # i am initiator
                    my_party_id = initiator_party_id
                else:
                    my_party_id = jobs[0].f_party_id
                    initiator_party_id = jobs[0].f_initiator_party_id
                api_utils.federated_api(job_id=job_id,
                                        method='POST',
                                        endpoint='/{}/job/stop'.format(API_VERSION),
                                        src_party_id=my_party_id,
                                        dest_party_id=initiator_party_id,
                                        src_role=None,
                                        json_body={'job_id': job_id, 'operate': 'kill'},
                                        work_mode=job_work_mode)
                TaskScheduler.finish_job(job_id=job_id,
                                         job_runtime_conf=json_loads(jobs[0].f_runtime_conf),
                                         stop=True)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')
def init(hosts, use_configuration_center, fate_flow_zk_path, fate_flow_port, model_transfer_path):
    if use_configuration_center:
        zk = CenterConfig.get_zk(hosts)
        zk.start()
        model_host = 'http://{}:{}{}'.format(get_lan_ip(), fate_flow_port, model_transfer_path)
        fate_flow_zk_path = '{}/{}'.format(fate_flow_zk_path, parse.quote(model_host, safe=' '))
        try:
            zk.create(fate_flow_zk_path, makepath=True)
        except:
            # the node may already exist
            pass
        zk.stop()
def register():
    if get_base_config("use_registry", False):
        zk = ServiceUtils.get_zk()
        zk.start()
        model_transfer_url = 'http://{}:{}{}'.format(get_lan_ip(), HTTP_PORT, FATE_FLOW_MODEL_TRANSFER_ENDPOINT)
        fate_flow_model_transfer_service = '{}/{}'.format(FATE_SERVICES_REGISTERED_PATH.get("fateflow", ""),
                                                          parse.quote(model_transfer_url, safe=' '))
        try:
            zk.create(fate_flow_model_transfer_service, makepath=True, ephemeral=True)
            stat_logger.info("register path {} to {}".format(
                fate_flow_model_transfer_service,
                ";".join(get_base_config("zookeeper", {}).get("hosts"))))
        except Exception as e:
            stat_logger.exception(e)
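# For reference: parse.quote(..., safe=' ') percent-encodes every reserved character,
# including '/', so the ephemeral znode registered above looks roughly like
# (illustrative values; the exact endpoint string comes from FATE_FLOW_MODEL_TRANSFER_ENDPOINT):
#     <FATE_SERVICES_REGISTERED_PATH["fateflow"]>/http%3A%2F%2F192.168.0.1%3A9380%2Fv1%2Fmodel%2Ftransfer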
def call_fun(func, config_data, dsl_path, config_path):
    ip = server_conf.get(SERVERS).get(ROLE).get('host')
    if ip in ['localhost', '127.0.0.1']:
        ip = get_lan_ip()
    http_port = server_conf.get(SERVERS).get(ROLE).get('http.port')
    server_url = "http://{}:{}/{}".format(ip, http_port, API_VERSION)

    if func in JOB_OPERATE_FUNC:
        if func == 'submit_job':
            if not config_path:
                raise Exception('the following arguments are required: {}'.format('runtime conf path'))
            dsl_data = {}
            if dsl_path or config_data.get('job_parameters', {}).get('job_type', '') == 'predict':
                if dsl_path:
                    dsl_path = os.path.abspath(dsl_path)
                    with open(dsl_path, 'r') as f:
                        dsl_data = json.load(f)
            else:
                raise Exception('the following arguments are required: {}'.format('dsl path'))
            post_data = {'job_dsl': dsl_data, 'job_runtime_conf': config_data}
            response = requests.post("/".join([server_url, "job", func.rstrip('_job')]), json=post_data)
            try:
                if response.json()['retcode'] == 999:
                    start_cluster_standalone_job_server()
                    response = requests.post("/".join([server_url, "job", func.rstrip('_job')]), json=post_data)
            except:
                pass
        elif func == 'data_view_query' or func == 'clean_queue':
            response = requests.post("/".join([server_url, "job", func.replace('_', '/')]), json=config_data)
        else:
            if func != 'query_job':
                detect_utils.check_config(config=config_data, required_arguments=['job_id'])
            post_data = config_data
            response = requests.post("/".join([server_url, "job", func.rstrip('_job')]), json=post_data)
            if func == 'query_job':
                response = response.json()
                if response['retcode'] == 0:
                    for i in range(len(response['data'])):
                        del response['data'][i]['f_runtime_conf']
                        del response['data'][i]['f_dsl']
    elif func in JOB_FUNC:
        if func == 'job_config':
            detect_utils.check_config(config=config_data,
                                      required_arguments=['job_id', 'role', 'party_id', 'output_path'])
            response = requests.post("/".join([server_url, func.replace('_', '/')]), json=config_data)
            response_data = response.json()
            if response_data['retcode'] == 0:
                job_id = response_data['data']['job_id']
                download_directory = os.path.join(config_data['output_path'], 'job_{}_config'.format(job_id))
                os.makedirs(download_directory, exist_ok=True)
                for k, v in response_data['data'].items():
                    if k == 'job_id':
                        continue
                    with open('{}/{}.json'.format(download_directory, k), 'w') as fw:
                        json.dump(v, fw, indent=4)
                del response_data['data']['dsl']
                del response_data['data']['runtime_conf']
                response_data['directory'] = download_directory
                response_data['retmsg'] = 'download successfully, please check {} directory'.format(download_directory)
                response = response_data
        elif func == 'job_log':
            detect_utils.check_config(config=config_data, required_arguments=['job_id', 'output_path'])
            job_id = config_data['job_id']
            tar_file_name = 'job_{}_log.tar.gz'.format(job_id)
            extract_dir = os.path.join(config_data['output_path'], 'job_{}_log'.format(job_id))
            with closing(requests.get("/".join([server_url, func.replace('_', '/')]), json=config_data,
                                      stream=True)) as response:
                if response.status_code == 200:
                    download_from_request(http_response=response, tar_file_name=tar_file_name, extract_dir=extract_dir)
                    response = {'retcode': 0,
                                'directory': extract_dir,
                                'retmsg': 'download successfully, please check {} directory'.format(extract_dir)}
                else:
                    response = response.json()
    elif func in TASK_OPERATE_FUNC:
        response = requests.post("/".join([server_url, "job", "task", func.rstrip('_task')]), json=config_data)
    elif func in TRACKING_FUNC:
        if func != 'component_metric_delete':
            detect_utils.check_config(config=config_data,
                                      required_arguments=['job_id', 'component_name', 'role', 'party_id'])
        if func == 'component_output_data':
            detect_utils.check_config(config=config_data, required_arguments=['output_path'])
            tar_file_name = 'job_{}_{}_{}_{}_output_data.tar.gz'.format(config_data['job_id'],
                                                                        config_data['component_name'],
                                                                        config_data['role'],
                                                                        config_data['party_id'])
            extract_dir = os.path.join(config_data['output_path'], tar_file_name.replace('.tar.gz', ''))
            with closing(requests.get("/".join([server_url, "tracking", func.replace('_', '/'), 'download']),
                                      json=config_data, stream=True)) as response:
                if response.status_code == 200:
                    try:
                        download_from_request(http_response=response, tar_file_name=tar_file_name,
                                              extract_dir=extract_dir)
                        response = {'retcode': 0,
                                    'directory': extract_dir,
                                    'retmsg': 'download successfully, please check {} directory'.format(extract_dir)}
                    except:
                        response = {'retcode': 100,
                                    'retmsg': 'download failed, please check if the parameters are correct'}
                else:
                    response = response.json()
        else:
            response = requests.post("/".join([server_url, "tracking", func.replace('_', '/')]), json=config_data)
    elif func in DATA_FUNC:
        if func == 'upload' and config_data.get('use_local_data', 1) != 0:
            file_name = config_data.get('file')
            if not os.path.isabs(file_name):
                file_name = os.path.join(file_utils.get_project_base_directory(), file_name)
            if os.path.exists(file_name):
                with open(file_name, 'rb') as fp:
                    data = MultipartEncoder(
                        fields={'file': (os.path.basename(file_name), fp, 'application/octet-stream')}
                    )
                    tag = [0]

                    def read_callback(monitor):
                        if config_data.get('verbose') == 1:
                            sys.stdout.write("\r UPLOADING:{0}{1}".format(
                                "|" * (monitor.bytes_read * 100 // monitor.len),
                                '%.2f%%' % (monitor.bytes_read * 100 // monitor.len)))
                            sys.stdout.flush()
                            if monitor.bytes_read / monitor.len == 1:
                                tag[0] += 1
                                if tag[0] == 2:
                                    sys.stdout.write('\n')

                    data = MultipartEncoderMonitor(data, read_callback)
                    response = requests.post("/".join([server_url, "data", func.replace('_', '/')]),
                                             data=data,
                                             params=config_data,
                                             headers={'Content-Type': data.content_type})
            else:
                raise Exception('The file is obtained from the fate flow client machine, but it does not exist, '
                                'please check the path: {}'.format(file_name))
        else:
            response = requests.post("/".join([server_url, "data", func.replace('_', '/')]), json=config_data)
        try:
            if response.json()['retcode'] == 999:
                start_cluster_standalone_job_server()
                response = requests.post("/".join([server_url, "data", func]), json=config_data)
        except:
            pass
    elif func in TABLE_FUNC:
        if func == "table_info":
            detect_utils.check_config(config=config_data, required_arguments=['namespace', 'table_name'])
            response = requests.post("/".join([server_url, "table", func]), json=config_data)
        else:
            response = requests.post("/".join([server_url, "table", func.lstrip('table_')]), json=config_data)
    elif func in MODEL_FUNC:
        if func == "import":
            file_path = config_data["file"]
            if not os.path.isabs(file_path):
                file_path = os.path.join(file_utils.get_project_base_directory(), file_path)
            if os.path.exists(file_path):
                files = {'file': open(file_path, 'rb')}
            else:
                raise Exception('The file is obtained from the fate flow client machine, but it does not exist, '
                                'please check the path: {}'.format(file_path))
            response = requests.post("/".join([server_url, "model", func]), data=config_data, files=files)
        elif func == "export":
            with closing(requests.get("/".join([server_url, "model", func]), json=config_data,
                                      stream=True)) as response:
                if response.status_code == 200:
                    archive_file_name = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
                    os.makedirs(config_data["output_path"], exist_ok=True)
                    archive_file_path = os.path.join(config_data["output_path"], archive_file_name)
                    with open(archive_file_path, 'wb') as fw:
                        for chunk in response.iter_content(1024):
                            if chunk:
                                fw.write(chunk)
                    response = {'retcode': 0,
                                'file': archive_file_path,
                                'retmsg': 'download successfully, please check {}'.format(archive_file_path)}
                else:
                    response = response.json()
        else:
            response = requests.post("/".join([server_url, "model", func]), json=config_data)
    elif func in PERMISSION_FUNC:
        detect_utils.check_config(config=config_data, required_arguments=['src_party_id', 'src_role'])
        response = requests.post("/".join([server_url, "permission", func.replace('_', '/')]), json=config_data)
    return response.json() if isinstance(response, requests.models.Response) else response
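# Illustrative invocation (argument values are made up, not taken from this file):
#     call_fun('submit_job', config_data=runtime_conf_dict,
#              dsl_path='examples/test_dsl.json', config_path='examples/test_conf.json')
# The caller is expected to handle the returned dict, e.g. check 'retcode' and 'retmsg'.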
def run_task():
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--processors_per_node', help="processors_per_node", type=int)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        executor_pid = os.getpid()
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        component_parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
        task_parameters = task_config['task_parameters']
        module_name = task_config['module_name']
        TaskExecutor.monkey_patch()
    except Exception as e:
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)
        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           component_module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = executor_pid
        run_class_paths = component_parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                      party_id=party_id,
                                      initiator_party_id=job_initiator.get('party_id', None),
                                      initiator_role=job_initiator.get('role', None),
                                      task_info=task.to_json())
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                  BACKEND=job_parameters.get('backend', 0))
        if args.processors_per_node and args.processors_per_node > 0 and RuntimeConfig.BACKEND == Backend.EGGROLL:
            session_options = {"eggroll.session.processors.per.node": args.processors_per_node}
        else:
            session_options = {}
        session.init(job_id=job_utils.generate_session_id(task_id, role, party_id),
                     mode=RuntimeConfig.WORK_MODE,
                     backend=RuntimeConfig.BACKEND,
                     options=session_options)
        federation.init(job_id=task_id, runtime_conf=component_parameters)
        schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info(component_parameters)
        schedule_logger().info(task_input_dsl)
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                       task_id=task_id,
                                                       job_args=job_args,
                                                       job_parameters=job_parameters,
                                                       task_parameters=task_parameters,
                                                       input_dsl=task_input_dsl,
                                                       if_save_as_task_input_data=job_parameters.get(
                                                           "save_as_task_input_data",
                                                           SAVE_AS_TASK_INPUT_DATA_SWITCH))
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        run_object.run(component_parameters, task_run_args)
        output_data = run_object.save_data()
        tracker.save_output_data_table(output_data,
                                       task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component')
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task.f_status = TaskStatus.COMPLETE
    except Exception as e:
        task.f_status = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        sync_success = False
        try:
            task.f_end_time = current_timestamp()
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())
            sync_success = True
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
        schedule_logger().info('task {} {} {} start time: {}'.format(task_id, role, party_id,
                                                                     timestamp_to_date(task.f_start_time)))
        schedule_logger().info('task {} {} {} end time: {}'.format(task_id, role, party_id,
                                                                   timestamp_to_date(task.f_end_time)))
        schedule_logger().info('task {} {} {} takes {}s'.format(task_id, role, party_id,
                                                                int(task.f_elapsed) / 1000))
        schedule_logger().info(
            'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                   task.f_status if sync_success else TaskStatus.FAILED))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                     task.f_status if sync_success else TaskStatus.FAILED))
def run_task(job_id, component_name, task_id, role, party_id, task_config):
    schedule_logger(job_id).info(
        'job {} {} {} {} task subprocess is ready'.format(job_id, component_name, role, party_id, task_config))
    task_process_start_status = False
    try:
        task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id), role, party_id, component_name)
        os.makedirs(task_dir, exist_ok=True)
        task_config_path = os.path.join(task_dir, 'task_config.json')
        with open(task_config_path, 'w') as fw:
            json.dump(task_config, fw)
        try:
            backend = task_config['job_parameters']['backend']
        except KeyError:
            backend = 0
            schedule_logger(job_id).warning("failed to get backend, set as 0")
        backend = Backend(backend)

        if backend.is_eggroll():
            process_cmd = [
                'python3', sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--processors_per_node', str(task_config['job_parameters'].get("processors_per_node", 0)),
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ]
        elif backend.is_spark():
            if "SPARK_HOME" not in os.environ:
                raise EnvironmentError("SPARK_HOME not found")
            spark_home = os.environ["SPARK_HOME"]

            # additional configs
            spark_submit_config = task_config['job_parameters'].get("spark_submit_config", dict())
            deploy_mode = spark_submit_config.get("deploy-mode", "client")
            if deploy_mode not in ["client"]:
                raise ValueError(f"deploy mode {deploy_mode} not supported")
            spark_submit_cmd = os.path.join(spark_home, "bin/spark-submit")
            process_cmd = [spark_submit_cmd, f'--name={task_id}#{role}']
            for k, v in spark_submit_config.items():
                if k != "conf":
                    process_cmd.append(f'--{k}={v}')
            if "conf" in spark_submit_config:
                for ck, cv in spark_submit_config["conf"].items():
                    process_cmd.append('--conf')
                    process_cmd.append(f'{ck}={cv}')
            process_cmd.extend([
                sys.modules[TaskExecutor.__module__].__file__,
                '-j', job_id,
                '-n', component_name,
                '-t', task_id,
                '-r', role,
                '-p', party_id,
                '-c', task_config_path,
                '--job_server', '{}:{}'.format(get_lan_ip(), HTTP_PORT),
            ])
        else:
            raise ValueError(f"backend {backend} not supported")

        task_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, party_id, component_name)
        schedule_logger(job_id).info(
            'job {} {} {} {} task subprocess start'.format(job_id, component_name, role, party_id, task_config))
        p = job_utils.run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=task_log_dir)
        if p:
            task_process_start_status = True
    except Exception as e:
        schedule_logger(job_id).exception(e)
    finally:
        schedule_logger(job_id).info(
            'job {} component {} on {} {} start task subprocess {}'.format(
                job_id, component_name, role, party_id,
                'success' if task_process_start_status else 'failed'))
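# For the EGGROLL branch above, the spawned command line ends up looking roughly like
# (paths and values are illustrative):
#     python3 <task_executor module file> -j <job_id> -n <component_name> -t <task_id> \
#         -r guest -p 9999 -c <task_dir>/task_config.json \
#         --processors_per_node 0 --job_server <lan_ip>:<HTTP_PORT>
# which is parsed by the argparse setup in run_task() above.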
def run_component(job_id, job_runtime_conf, job_parameters, job_initiator, job_args, dag, component):
    parameters = component.get_role_parameters()
    component_name = component.get_name()
    module_name = component.get_module()
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component_name)
    schedule_logger(job_id).info('job {} run component {}'.format(job_id, component_name))
    for role, partys_parameters in parameters.items():
        for party_index in range(len(partys_parameters)):
            party_parameters = partys_parameters[party_index]
            if role in job_args:
                party_job_args = job_args[role][party_index]['args']
            else:
                party_job_args = {}
            dest_party_id = party_parameters.get('local', {}).get('party_id')
            response = federated_api(job_id=job_id,
                                     method='POST',
                                     endpoint='/{}/schedule/{}/{}/{}/{}/{}/run'.format(
                                         API_VERSION, job_id, component_name, task_id, role, dest_party_id),
                                     src_party_id=job_initiator['party_id'],
                                     dest_party_id=dest_party_id,
                                     src_role=job_initiator['role'],
                                     json_body={'job_parameters': job_parameters,
                                                'job_initiator': job_initiator,
                                                'job_args': party_job_args,
                                                'parameters': party_parameters,
                                                'module_name': module_name,
                                                'input': component.get_input(),
                                                'output': component.get_output(),
                                                'job_server': {'ip': get_lan_ip(),
                                                               'http_port': RuntimeConfig.HTTP_PORT}},
                                     work_mode=job_parameters['work_mode'])
            if response['retcode']:
                if 'not authorized' in response['retmsg']:
                    raise Exception('run component {} not authorized'.format(component_name))
    component_task_status = TaskScheduler.check_task_status(job_id=job_id, component=component)
    job_status = TaskScheduler.check_job_status(job_id)
    if component_task_status and job_status:
        task_success = True
    else:
        task_success = False
    schedule_logger(job_id).info(
        'job {} component {} run {}'.format(job_id, component_name, 'success' if task_success else 'failed'))
    # update progress
    TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                  work_mode=job_parameters['work_mode'],
                                  initiator_party_id=job_initiator['party_id'],
                                  initiator_role=job_initiator['role'],
                                  job_info=job_utils.update_job_progress(job_id=job_id, dag=dag,
                                                                         current_task_id=task_id).to_json())
    TaskScheduler.stop(job_id=job_id, component_name=component_name)
    if task_success:
        next_components = dag.get_next_components(component_name)
        schedule_logger(job_id).info('job {} component {} next components is {}'.format(
            job_id, component_name, [next_component.get_name() for next_component in next_components]))
        for next_component in next_components:
            try:
                schedule_logger(job_id).info(
                    'job {} check component {} dependencies status'.format(job_id, next_component.get_name()))
                dependencies_status = TaskScheduler.check_dependencies(job_id=job_id, dag=dag,
                                                                       component=next_component)
                job_status = TaskScheduler.check_job_status(job_id)
                schedule_logger(job_id).info(
                    'job {} component {} dependencies status is {}, job status is {}'.format(
                        job_id, next_component.get_name(), dependencies_status, job_status))
                if dependencies_status and job_status:
                    run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters,
                                                             job_initiator, job_args, dag, next_component)
                else:
                    run_status = False
            except Exception as e:
                schedule_logger(job_id).exception(e)
                run_status = False
            if not run_status:
                return False
        return True
    else:
        if component_task_status is None:
            end_status = JobStatus.TIMEOUT
        else:
            end_status = JobStatus.FAILED
        TaskScheduler.stop(job_id=job_id, end_status=end_status)
        return False
HTTP_PORT = get_base_config("fate_flow", {}).get("http_port")
GRPC_PORT = get_base_config("fate_flow", {}).get("grpc_port")

# Standalone jobs will be sent to the standalone job server when FATE-Flow works in cluster
# deploy mode; this is not the port of FATE-Flow itself in standalone deploy mode.
CLUSTER_STANDALONE_JOB_SERVER_PORT = 9381

# services ip and port
SERVER_CONF_PATH = 'arch/conf/server_conf.json'
SERVING_PATH = '/servers/servings'
server_conf = file_utils.load_json_conf(SERVER_CONF_PATH)
PROXY_HOST = server_conf.get(SERVERS).get('proxy').get('host')
PROXY_PORT = server_conf.get(SERVERS).get('proxy').get('port')
BOARD_HOST = server_conf.get(SERVERS).get('fateboard').get('host')
if BOARD_HOST == 'localhost':
    BOARD_HOST = get_lan_ip()
BOARD_PORT = server_conf.get(SERVERS).get('fateboard').get('port')
MANAGER_HOST = server_conf.get(SERVERS).get('fatemanager', {}).get('host')
MANAGER_PORT = server_conf.get(SERVERS).get('fatemanager', {}).get('port')
SERVINGS = CenterConfig.get_settings(path=SERVING_PATH, servings_zk_path=SERVINGS_ZK_PATH,
                                     use_zk=USE_CONFIGURATION_CENTER, hosts=ZOOKEEPER_HOSTS,
                                     server_conf_path=SERVER_CONF_PATH)
BOARD_DASHBOARD_URL = 'http://%s:%d/index.html#/dashboard?job_id={}&role={}&party_id={}' % (BOARD_HOST, BOARD_PORT)

# switch
SAVE_AS_TASK_INPUT_DATA_SWITCH = True
SAVE_AS_TASK_INPUT_DATA_IN_MEMORY = True
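# The lookups above assume a server_conf.json shaped roughly like the following
# (illustrative sketch, assuming SERVERS == 'servers'; only the entries read here are listed):
# {
#     "servers": {
#         "proxy":       {"host": "...", "port": ...},
#         "fateboard":   {"host": "...", "port": ...},
#         "fatemanager": {"host": "...", "port": ...}
#     }
# }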