def init_job_queue():
    """Build the job queue for the current work mode and register it.

    Standalone mode uses an ``InProcessQueue``; cluster mode uses a
    ``RedisQueue`` configured from the ``REDIS`` settings. In both cases the
    queue is stored on ``RuntimeConfig`` under ``JOB_QUEUE``.

    Raises:
        Exception: if ``RuntimeConfig.WORK_MODE`` is neither
            ``WorkMode.STANDALONE`` nor ``WorkMode.CLUSTER``.
    """
    if RuntimeConfig.WORK_MODE == WorkMode.STANDALONE:
        # Single-process deployment: the queue lives inside this process.
        RuntimeConfig.init_config(JOB_QUEUE=InProcessQueue())
        return

    if not (RuntimeConfig.WORK_MODE == WorkMode.CLUSTER):
        raise Exception('init queue failed.')

    # Cluster deployment: the queue is backed by Redis.
    redis_backed_queue = RedisQueue(
        queue_name='fate_flow_job_queue',
        host=REDIS['host'],
        port=REDIS['port'],
        password=REDIS['password'],
        max_connections=REDIS['max_connections'],
    )
    RuntimeConfig.init_config(JOB_QUEUE=redis_backed_queue)
def __init__(self):
    """Open the database connection that matches ``WORK_MODE``.

    Standalone mode connects to the local ``fate_flow_sqlite.db`` file via
    ``APSWDatabase``; cluster mode connects through ``PooledMySQLDatabase``
    using the ``DATABASE`` settings (``name`` is the database name, all
    remaining keys are passed as keyword arguments). ``RuntimeConfig`` is
    told whether a local database is in use.

    Raises:
        Exception: if ``WORK_MODE`` is neither standalone nor cluster.
    """
    # Work on a copy so popping "name" does not mutate the shared settings.
    connection_options = DATABASE.copy()
    schema_name = connection_options.pop("name")

    if WORK_MODE == WorkMode.STANDALONE:
        self.database_connection = APSWDatabase('fate_flow_sqlite.db')
        RuntimeConfig.init_config(USE_LOCAL_DATABASE=True)
        stat_logger.info('init sqlite database on standalone mode successfully')
        return

    if WORK_MODE == WorkMode.CLUSTER:
        self.database_connection = PooledMySQLDatabase(schema_name, **connection_options)
        stat_logger.info('init mysql database on cluster mode successfully')
        RuntimeConfig.init_config(USE_LOCAL_DATABASE=False)
        return

    raise Exception('can not init database')
def run_task(cls):
    """Run one component task inside the executor process.

    The task is described by command-line arguments (job id, component name,
    task id/version, role, party id and a task-parameter config file). The
    function runs in two phases:

    1. Setup: parse the arguments, load the job configuration, resolve the
       component's parameters for this party and load the run parameters.
       Any exception here is logged and the function returns ``None``
       without reporting to the driver.
    2. Execution: configure logging, create the trackers, report ``RUNNING``,
       initialise the computing and federation sessions, import and run the
       component class, then persist its output data and model. The final
       status, end time and elapsed time are always reported through
       ``cls.report_task_update_to_driver``.

    Returns:
        dict | None: the ``task_info`` dictionary (including
        ``party_status``, ``end_time`` and ``elapsed``) once the execution
        phase has finished, or ``None`` if the setup phase failed.
    """
    task_info = {}
    try:
        # parse_args() is called without arguments, so it reads sys.argv.
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-v', '--task_version', required=True, type=int, help="task version")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=int, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task parameters")
        parser.add_argument('--run_ip', help="run ip", type=str)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        # --job_server is split on ':' into host (index 0) and port (index 1).
        if args.job_server:
            RuntimeConfig.init_config(
                JOB_SERVER_HOST=args.job_server.split(':')[0],
                HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        task_version = args.task_version
        role = args.role
        party_id = args.party_id
        executor_pid = os.getpid()
        # Identity fields that accompany every status report to the driver.
        task_info.update({
            "job_id": job_id,
            "component_name": component_name,
            "task_id": task_id,
            "task_version": task_version,
            "role": role,
            "party_id": party_id,
            "run_ip": args.run_ip,
            "run_pid": executor_pid
        })
        # Recorded here so the elapsed time also covers the rest of setup.
        start_time = current_timestamp()
        job_conf = job_utils.get_job_conf(job_id, role)
        job_dsl = job_conf["job_dsl_path"]
        job_runtime_conf = job_conf["job_runtime_conf_path"]
        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job_dsl,
            runtime_conf=job_runtime_conf,
            train_runtime_conf=job_conf["train_runtime_conf_path"],
            pipeline_dsl=job_conf["pipeline_dsl_path"])
        # Position of this party within its role's party list; used below to
        # pick this party's entry from the per-role component parameters.
        party_index = job_runtime_conf["role"][role].index(party_id)
        job_args_on_party = TaskExecutor.get_job_args_on_party(
            dsl_parser, job_runtime_conf, role, party_id)
        component = dsl_parser.get_component_info(
            component_name=component_name)
        component_parameters = component.get_role_parameters()
        # Falls back to an empty dict when the role has no parameters.
        component_parameters_on_party = component_parameters[role][
            party_index] if role in component_parameters else {}
        module_name = component.get_module()
        task_input_dsl = component.get_input()
        task_output_dsl = component.get_output()
        component_parameters_on_party[
            'output_data_name'] = task_output_dsl.get('data')
        task_parameters = RunParameters(
            **file_utils.load_json_conf(args.config))
        # The same RunParameters object is used as the job parameters.
        job_parameters = task_parameters
        if job_parameters.assistant_role:
            TaskExecutor.monkey_patch()
    except Exception as e:
        # Setup failed: log and stop. Nothing is reported to the driver here
        # and the function returns None.
        traceback.print_exc()
        schedule_logger().exception(e)
        task_info["party_status"] = TaskStatus.FAILED
        return
    try:
        # Task logs go to <job log dir>/<role>/<party_id>/<component_name>
        # and are also appended to the parent (party-level) log directory.
        job_log_dir = os.path.join(
            job_utils.get_job_log_directory(job_id=job_id), role,
            str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log.LoggerFactory.set_directory(directory=task_log_dir,
                                        parent_log_dir=job_log_dir,
                                        append_to_parent_log=True,
                                        force=True)

        # `tracker` persists output data and models below; `tracker_client`
        # is the object handed to the component via set_tracker().
        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          component_name=component_name,
                          task_id=task_id,
                          task_version=task_version,
                          model_id=job_parameters.model_id,
                          model_version=job_parameters.model_version,
                          component_module_name=module_name,
                          job_parameters=job_parameters)
        tracker_client = TrackerClient(
            job_id=job_id,
            role=role,
            party_id=party_id,
            component_name=component_name,
            task_id=task_id,
            task_version=task_version,
            model_id=job_parameters.model_id,
            model_version=job_parameters.model_version,
            component_module_name=module_name,
            job_parameters=job_parameters)
        # 'CodePath' has the form "<pkg>/<pkg>/.../<module>.py/<ClassName>":
        # the last segment is the class name, the one before it the module
        # file, and everything earlier the package path.
        run_class_paths = component_parameters_on_party.get(
            'CodePath').split('/')
        run_class_package = '.'.join(
            run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                '.py', '')
        run_class_name = run_class_paths[-1]
        task_info["party_status"] = TaskStatus.RUNNING
        cls.report_task_update_to_driver(task_info=task_info)

        # init environment, process is shared globally
        RuntimeConfig.init_config(
            WORK_MODE=job_parameters.work_mode,
            COMPUTING_ENGINE=job_parameters.computing_engine,
            FEDERATION_ENGINE=job_parameters.federation_engine,
            FEDERATED_MODE=job_parameters.federated_mode)

        # Only the EGGROLL computing engine receives session options.
        if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
            session_options = task_parameters.eggroll_run.copy()
        else:
            session_options = {}

        sess = session.Session(
            computing_type=job_parameters.computing_engine,
            federation_type=job_parameters.federation_engine)
        computing_session_id = job_utils.generate_session_id(
            task_id, task_version, role, party_id)
        sess.init_computing(computing_session_id=computing_session_id,
                            options=session_options)
        federation_session_id = job_utils.generate_task_version_id(
            task_id, task_version)
        component_parameters_on_party[
            "job_parameters"] = job_parameters.to_dict()
        sess.init_federation(
            federation_session_id=federation_session_id,
            runtime_conf=component_parameters_on_party,
            service_conf=job_parameters.engines_address.get(
                EngineType.FEDERATION, {}))
        sess.as_default()

        schedule_logger().info('Run {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id))
        schedule_logger().info("Component parameters on party {}".format(
            component_parameters_on_party))
        schedule_logger().info("Task input dsl {}".format(task_input_dsl))
        task_run_args = cls.get_task_run_args(
            job_id=job_id,
            role=role,
            party_id=party_id,
            task_id=task_id,
            task_version=task_version,
            job_args=job_args_on_party,
            job_parameters=job_parameters,
            task_parameters=task_parameters,
            input_dsl=task_input_dsl,
        )
        # These four modules additionally receive the job parameters in
        # their run arguments.
        if module_name in {"Upload", "Download", "Reader", "Writer"}:
            task_run_args["job_parameters"] = job_parameters
        # Import the component module and instantiate its class with no
        # constructor arguments.
        run_object = getattr(importlib.import_module(run_class_package),
                             run_class_name)()
        run_object.set_tracker(tracker=tracker_client)
        run_object.set_task_version_id(
            task_version_id=job_utils.generate_task_version_id(
                task_id, task_version))
        # add profile logs
        profile.profile_start()
        run_object.run(component_parameters_on_party, task_run_args)
        profile.profile_ends()
        output_data = run_object.save_data()
        # Normalise a single output into a one-element list.
        if not isinstance(output_data, list):
            output_data = [output_data]
        for index in range(0, len(output_data)):
            # Use the name declared in the output DSL when there is one,
            # otherwise the positional index as a string.
            data_name = task_output_dsl.get(
                'data')[index] if task_output_dsl.get(
                    'data') else '{}'.format(index)
            persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                computing_table=output_data[index],
                output_storage_engine=job_parameters.storage_engine,
                output_storage_address=job_parameters.engines_address.get(
                    EngineType.STORAGE, {}))
            # Only record the output when both identifiers are truthy.
            if persistent_table_namespace and persistent_table_name:
                tracker.log_output_data_info(
                    data_name=data_name,
                    table_namespace=persistent_table_namespace,
                    table_name=persistent_table_name)
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(
            output_model, task_output_dsl['model'][0]
            if task_output_dsl.get('model') else 'default')
        task_info["party_status"] = TaskStatus.SUCCESS
    except Exception as e:
        task_info["party_status"] = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        # Always attempt to report the final status; if the report itself
        # fails, the locally recorded status becomes FAILED.
        try:
            task_info["end_time"] = current_timestamp()
            task_info["elapsed"] = task_info["end_time"] - start_time
            cls.report_task_update_to_driver(task_info=task_info)
        except Exception as e:
            task_info["party_status"] = TaskStatus.FAILED
            traceback.print_exc()
            schedule_logger().exception(e)
    schedule_logger().info('task {} {} {} start time: {}'.format(
        task_id, role, party_id, timestamp_to_date(start_time)))
    schedule_logger().info('task {} {} {} end time: {}'.format(
        task_id, role, party_id, timestamp_to_date(task_info["end_time"])))
    # The elapsed value is divided by 1000 before being logged with an "s"
    # suffix.
    schedule_logger().info('task {} {} {} takes {}s'.format(
        task_id, role, party_id,
        int(task_info["elapsed"]) / 1000))
    schedule_logger().info('Finish {} {} {} {} {} {} task {}'.format(
        job_id, component_name, task_id, task_version, role, party_id,
        task_info["party_status"]))

    # The same summary is also written to stdout.
    print('Finish {} {} {} {} {} {} task {}'.format(
        job_id, component_name, task_id, task_version, role, party_id,
        task_info["party_status"]))
    return task_info
'host': 'mysql', 'port': 3306, 'max_connections': 100, 'stale_timeout': 30, } REDIS = { 'host': 'redis', 'port': 6379, 'password': '******', 'max_connections': 500 } REDIS_QUEUE_DB_INDEX = 0 JOB_MODULE_CONF = file_utils.load_json_conf("fate_flow/job_module_conf.json") """ Services """ server_conf = file_utils.load_json_conf("arch/conf/server_conf.json") PROXY_HOST = server_conf.get(SERVERS).get('proxy').get('host') PROXY_PORT = server_conf.get(SERVERS).get('proxy').get('port') BOARD_HOST = server_conf.get(SERVERS).get('fateboard').get('host') if BOARD_HOST == 'localhost': BOARD_HOST = get_lan_ip() BOARD_PORT = server_conf.get(SERVERS).get('fateboard').get('port') SERVINGS = server_conf.get(SERVERS).get('servings') BOARD_DASHBOARD_URL = 'http://%s:%d/index.html#/dashboard?job_id={}&role={}&party_id={}' % (BOARD_HOST, BOARD_PORT) RuntimeConfig.init_config(WORK_MODE=WORK_MODE) RuntimeConfig.init_config(HTTP_PORT=HTTP_PORT)
def run_task():
    """Run one component task inside the executor process.

    The task is described by command-line arguments (job id, component name,
    task id, role, party id and a JSON task config file).

    Phase 1 (setup) parses the arguments and loads the task configuration;
    if it raises, the error is logged and the function returns without
    syncing any status.

    Phase 2 (execution) configures logging, creates the tracker, syncs the
    ``RUNNING`` status, initialises the session and federation, imports and
    runs the component class and stores its output data and model. The final
    status is always synced to the initiator through
    ``TaskExecutor.sync_task_status``.

    If the execution phase fails before the task's start time has been
    recorded, the creation time is used as the start time. Without this the
    elapsed-time computation raised, the FAILED status was never synced and
    the closing log statements could crash.

    Returns:
        None
    """
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        # parse_args() is called without arguments, so it reads sys.argv.
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--processors_per_node', help="processors_per_node", type=int)
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger(args.job_id).info('enter task process')
        schedule_logger(args.job_id).info(args)
        # init function args
        # --job_server is split on ':'; the second part is the HTTP port.
        if args.job_server:
            RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
            RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        executor_pid = os.getpid()
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        component_parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
        task_parameters = task_config['task_parameters']
        module_name = task_config['module_name']
        TaskExecutor.monkey_patch()
    except Exception as e:
        # Setup failed: log and stop. No status is synced from here.
        traceback.print_exc()
        schedule_logger().exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # Task logs go to <job log dir>/<role>/<party_id>/<component_name>
        # and are also appended to the parent (party-level) log directory.
        job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                              append_to_parent_log=True, force=True)

        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           component_module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = executor_pid
        # 'CodePath' has the form "<pkg>/.../<module>.py/<ClassName>".
        run_class_paths = component_parameters.get('CodePath').split('/')
        run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
        run_class_name = run_class_paths[-1]
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                      party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                      initiator_role=job_initiator.get('role', None),
                                      task_info=task.to_json())

        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                  BACKEND=job_parameters.get('backend', 0))
        # The per-node processor count is only forwarded for the EGGROLL
        # backend and only when a positive value was given.
        if args.processors_per_node and args.processors_per_node > 0 and RuntimeConfig.BACKEND == Backend.EGGROLL:
            session_options = {"eggroll.session.processors.per.node": args.processors_per_node}
        else:
            session_options = {}
        session.init(job_id=job_utils.generate_session_id(task_id, role, party_id),
                     mode=RuntimeConfig.WORK_MODE,
                     backend=RuntimeConfig.BACKEND,
                     options=session_options)
        federation.init(job_id=task_id, runtime_conf=component_parameters)

        schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
        schedule_logger().info(component_parameters)
        schedule_logger().info(task_input_dsl)
        task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                       task_id=task_id,
                                                       job_args=job_args,
                                                       job_parameters=job_parameters,
                                                       task_parameters=task_parameters,
                                                       input_dsl=task_input_dsl,
                                                       if_save_as_task_input_data=job_parameters.get("save_as_task_input_data", SAVE_AS_TASK_INPUT_DATA_SWITCH)
                                                       )
        # Import the component module and instantiate its class with no
        # constructor arguments.
        run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        run_object.run(component_parameters, task_run_args)
        output_data = run_object.save_data()
        # Use the first declared data output name, or 'component' if the
        # output DSL declares none.
        tracker.save_output_data_table(output_data,
                                       task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component')
        output_model = run_object.export_model()
        # There is only one model output at the current dsl version.
        tracker.save_output_model(output_model,
                                  task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
        task.f_status = TaskStatus.COMPLETE
    except Exception as e:
        task.f_status = TaskStatus.FAILED
        schedule_logger().exception(e)
    finally:
        sync_success = False
        try:
            task.f_end_time = current_timestamp()
            # The execution phase may have failed before the start time was
            # recorded (e.g. while configuring logging or creating the
            # tracker). Fall back to the creation time so the elapsed time
            # can be computed and the FAILED status still reaches the driver.
            if getattr(task, 'f_start_time', None) is None:
                task.f_start_time = task.f_create_time
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())
            sync_success = True
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
    schedule_logger().info('task {} {} {} start time: {}'.format(task_id, role, party_id, timestamp_to_date(task.f_start_time)))
    schedule_logger().info('task {} {} {} end time: {}'.format(task_id, role, party_id, timestamp_to_date(task.f_end_time)))
    schedule_logger().info('task {} {} {} takes {}s'.format(task_id, role, party_id, int(task.f_elapsed)/1000))
    # A task whose final status could not be synced is reported as FAILED.
    schedule_logger().info(
        'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                               task.f_status if sync_success else TaskStatus.FAILED))

    print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id,
                                                 task.f_status if sync_success else TaskStatus.FAILED))
'/{}/forward'.format(API_VERSION): forward_app_manager }) # init signal.signal(signal.SIGTERM, job_utils.cleaning) signal.signal(signal.SIGCHLD, job_utils.wait_child_process) init_database_tables() # init runtime config import argparse parser = argparse.ArgumentParser() parser.add_argument('--standalone_node', default=False, help="if standalone node mode or not ", action='store_true') args = parser.parse_args() if args.standalone_node: RuntimeConfig.init_config(WORK_MODE=WorkMode.STANDALONE) RuntimeConfig.init_config(HTTP_PORT=CLUSTER_STANDALONE_JOB_SERVER_PORT) session_utils.init_session_for_flow_server() RuntimeConfig.init_env() RuntimeConfig.set_process_role(ProcessRole.SERVER) queue_manager.init_job_queue() job_controller.JobController.init() history_job_clean = job_controller.JobClean() history_job_clean.start() PrivilegeAuth.init() ServiceUtils.register() # start job detector job_detector.JobDetector(interval=5 * 1000).start() # start scheduler scheduler = dag_scheduler.DAGScheduler( queue=RuntimeConfig.JOB_QUEUE, concurrent_num=MAX_CONCURRENT_JOB_RUN)
def run_task():
    """Run one component task inside the executor process.

    The task is described by command-line arguments (job id, component name,
    task id, role, party id and a JSON task config file).

    Phase 1 (setup) parses the arguments and loads the task configuration;
    if it raises, the error is logged and the function returns without
    syncing any status.

    Phase 2 (execution) initialises storage and federation, configures
    logging, creates the tracker, imports the component class, syncs the
    ``RUNNING`` status, runs the component and stores whichever outputs the
    output DSL declares. The final status is always synced through
    ``TaskExecutor.sync_task_status``.

    If the execution phase fails before the task's start time has been
    recorded (storage, federation, logging and tracker initialisation all
    happen first), the creation time is used as the start time. Without this
    the elapsed-time computation raised and the FAILED status was never
    synced.

    Returns:
        None
    """
    task = Task()
    task.f_create_time = current_timestamp()
    try:
        # parse_args() is called without arguments, so it reads sys.argv.
        parser = argparse.ArgumentParser()
        parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
        parser.add_argument('-n', '--component_name', required=True, type=str, help="component name")
        parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
        parser.add_argument('-r', '--role', required=True, type=str, help="role")
        parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
        parser.add_argument('-c', '--config', required=True, type=str, help="task config")
        parser.add_argument('--job_server', help="job server", type=str)
        args = parser.parse_args()
        schedule_logger.info('enter task process')
        schedule_logger.info(args)
        # init function args
        # --job_server is split on ':'; the second part is the HTTP port.
        if args.job_server:
            RuntimeConfig.init_config(
                HTTP_PORT=args.job_server.split(':')[1])
        job_id = args.job_id
        component_name = args.component_name
        task_id = args.task_id
        role = args.role
        party_id = int(args.party_id)
        task_config = file_utils.load_json_conf(args.config)
        job_parameters = task_config['job_parameters']
        job_initiator = task_config['job_initiator']
        job_args = task_config['job_args']
        task_input_dsl = task_config['input']
        task_output_dsl = task_config['output']
        parameters = task_config['parameters']
        module_name = task_config['module_name']
    except Exception as e:
        # Setup failed: log and stop. No status is synced from here.
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
        return
    try:
        # init environment, process is shared globally
        RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
        storage.init_storage(job_id=task_id,
                             work_mode=RuntimeConfig.WORK_MODE)
        federation.init(job_id=task_id, runtime_conf=parameters)
        # Task logs go to <job log dir>/<role>/<party_id>/<component_name>
        # and are also appended to the parent (party-level) log directory.
        job_log_dir = os.path.join(
            job_utils.get_job_log_directory(job_id=job_id), role,
            str(party_id))
        task_log_dir = os.path.join(job_log_dir, component_name)
        log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                              parent_log_dir=job_log_dir,
                                              append_to_parent_log=True,
                                              force=True)

        task.f_job_id = job_id
        task.f_component_name = component_name
        task.f_task_id = task_id
        task.f_role = role
        task.f_party_id = party_id
        task.f_operator = 'python_operator'
        tracker = Tracking(job_id=job_id,
                           role=role,
                           party_id=party_id,
                           component_name=component_name,
                           task_id=task_id,
                           model_id=job_parameters['model_id'],
                           model_version=job_parameters['model_version'],
                           module_name=module_name)
        task.f_start_time = current_timestamp()
        task.f_run_ip = get_lan_ip()
        task.f_run_pid = os.getpid()
        # 'CodePath' has the form "<pkg>/.../<module>.py/<ClassName>".
        run_class_paths = parameters.get('CodePath').split('/')
        run_class_package = '.'.join(
            run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                '.py', '')
        run_class_name = run_class_paths[-1]
        task_run_args = TaskExecutor.get_task_run_args(
            job_id=job_id,
            role=role,
            party_id=party_id,
            job_parameters=job_parameters,
            job_args=job_args,
            input_dsl=task_input_dsl)
        # Import the component module and instantiate its class with no
        # constructor arguments.
        run_object = getattr(importlib.import_module(run_class_package),
                             run_class_name)()
        run_object.set_tracker(tracker=tracker)
        run_object.set_taskid(taskid=task_id)
        task.f_status = TaskStatus.RUNNING
        TaskExecutor.sync_task_status(job_id=job_id,
                                      component_name=component_name,
                                      task_id=task_id,
                                      role=role,
                                      party_id=party_id,
                                      initiator_party_id=job_initiator.get(
                                          'party_id', None),
                                      task_info=task.to_json())

        schedule_logger.info('run {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id))
        schedule_logger.info(parameters)
        schedule_logger.info(task_input_dsl)
        run_object.run(parameters, task_run_args)
        # Outputs are only saved when the output DSL declares them.
        if task_output_dsl:
            if task_output_dsl.get('data', []):
                output_data = run_object.save_data()
                tracker.save_output_data_table(
                    output_data,
                    task_output_dsl.get('data')[0])
            if task_output_dsl.get('model', []):
                output_model = run_object.export_model()
                # There is only one model output at the current dsl version.
                tracker.save_output_model(output_model,
                                          task_output_dsl['model'][0])
        task.f_status = TaskStatus.SUCCESS
    except Exception as e:
        schedule_logger.exception(e)
        task.f_status = TaskStatus.FAILED
    finally:
        try:
            task.f_end_time = current_timestamp()
            # The execution phase may have failed before the start time was
            # recorded (e.g. in storage or federation initialisation). Fall
            # back to the creation time so the elapsed time can be computed
            # and the FAILED status still reaches the initiator.
            if getattr(task, 'f_start_time', None) is None:
                task.f_start_time = task.f_create_time
            task.f_elapsed = task.f_end_time - task.f_start_time
            task.f_update_time = current_timestamp()
            TaskExecutor.sync_task_status(
                job_id=job_id,
                component_name=component_name,
                task_id=task_id,
                role=role,
                party_id=party_id,
                initiator_party_id=job_initiator.get('party_id', None),
                task_info=task.to_json())
        except Exception as e:
            schedule_logger.exception(e)
    schedule_logger.info('finish {} {} {} {} {} {} task'.format(
        job_id, component_name, task_id, role, party_id, task.f_status))
    # The same summary is also written to stdout.
    print('finish {} {} {} {} {} {} task'.format(job_id, component_name,
                                                 task_id, role, party_id,
                                                 task.f_status))
# Endpoint FATE_FLOW_MODEL_TRANSFER_ENDPOINT = "/v1/model/transfer" FATE_MANAGER_GET_NODE_INFO_ENDPOINT = "/fate-manager/api/site/secretinfo" FATE_MANAGER_NODE_CHECK_ENDPOINT = "/fate-manager/api/site/checksite" FATE_BOARD_DASHBOARD_ENDPOINT = "/index.html#/dashboard?job_id={}&role={}&party_id={}" # Logger log.LoggerFactory.LEVEL = 10 # {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0} log.LoggerFactory.set_directory( os.path.join(file_utils.get_project_base_directory(), "logs", "fate_flow")) stat_logger = log.getLogger("fate_flow_stat") detect_logger = log.getLogger("fate_flow_detect") access_logger = log.getLogger("fate_flow_access") data_manager_logger = log.getLogger("fate_flow_data_manager") peewee_logger = log.getLogger("peewee") # Switch UPLOAD_DATA_FROM_CLIENT = True USE_AUTHENTICATION = False PRIVILEGE_COMMAND_WHITELIST = [] CHECK_NODES_IDENTITY = False DEFAULT_FEDERATED_STATUS_COLLECT_TYPE = get_base_config( FATEFLOW_SERVICE_NAME, {}).get("default_federated_status_collect_type", "PUSH") # Init RuntimeConfig.init_config(WORK_MODE=WORK_MODE) RuntimeConfig.init_config(JOB_SERVER_HOST=IP, HTTP_PORT=HTTP_PORT)
if __name__ == '__main__': manager.url_map.strict_slashes = False app = DispatcherMiddleware( manager, { '/{}/data'.format(API_VERSION): data_access_app_manager, '/{}/model'.format(API_VERSION): model_app_manager, '/{}/job'.format(API_VERSION): job_app_manager, '/{}/table'.format(API_VERSION): table_app_manager, '/{}/tracking'.format(API_VERSION): tracking_app_manager, '/{}/pipeline'.format(API_VERSION): pipeline_app_manager, }) # init signal.signal(signal.SIGCHLD, job_utils.wait_child_process) init_database_tables() RuntimeConfig.init_config(WORK_MODE=WORK_MODE) storage.init_storage(work_mode=RuntimeConfig.WORK_MODE) queue_manager.init_job_queue() job_controller.JobController.init() # start job detector job_detector.JobDetector(interval=5 * 1000).start() # start scheduler scheduler = dag_scheduler.DAGScheduler( queue=RuntimeConfig.JOB_QUEUE, concurrent_num=MAX_CONCURRENT_JOB_RUN) scheduler.start() # start grpc server server = grpc.server( futures.ThreadPoolExecutor(max_workers=10), options=[(cygrpc.ChannelArgKey.max_send_message_length, -1), (cygrpc.ChannelArgKey.max_receive_message_length, -1)])