Example 1: initializing the job queue by work mode (in-process queue for standalone, Redis-backed queue for cluster)
def init_job_queue():
    if RuntimeConfig.WORK_MODE == WorkMode.STANDALONE:
        job_queue = InProcessQueue()
        RuntimeConfig.init_config(JOB_QUEUE=job_queue)
    elif RuntimeConfig.WORK_MODE == WorkMode.CLUSTER:
        job_queue = RedisQueue(queue_name='fate_flow_job_queue', host=REDIS['host'], port=REDIS['port'],
                               password=REDIS['password'], max_connections=REDIS['max_connections'])
        RuntimeConfig.init_config(JOB_QUEUE=job_queue)
    else:
        raise Exception('init queue failed.')
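
A minimal, standard-library sketch of the same selection pattern (illustrative only; WorkMode, the queue class and the factory name are stand-ins, not the FATE Flow ones):

import queue
from enum import IntEnum

class WorkMode(IntEnum):
    STANDALONE = 0
    CLUSTER = 1

def make_job_queue(work_mode):
    if work_mode == WorkMode.STANDALONE:
        # a plain in-process FIFO queue is enough on a single node
        return queue.Queue()
    elif work_mode == WorkMode.CLUSTER:
        # a cluster deployment needs a shared broker (e.g. Redis);
        # left unimplemented here to keep the sketch dependency-free
        raise NotImplementedError("cluster mode requires a shared queue backend")
    else:
        raise ValueError("init queue failed: unsupported work mode {}".format(work_mode))

job_queue = make_job_queue(WorkMode.STANDALONE)
job_queue.put({"job_id": "example_job"})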
Example 2: selecting the database backend by work mode (local SQLite for standalone, pooled MySQL for cluster)
 def __init__(self):
     database_config = DATABASE.copy()
     db_name = database_config.pop("name")
     if WORK_MODE == WorkMode.STANDALONE:
         self.database_connection = APSWDatabase('fate_flow_sqlite.db')
         RuntimeConfig.init_config(USE_LOCAL_DATABASE=True)
         stat_logger.info('init sqlite database on standalone mode successfully')
     elif WORK_MODE == WorkMode.CLUSTER:
         self.database_connection = PooledMySQLDatabase(db_name, **database_config)
         stat_logger.info('init mysql database on cluster mode successfully')
         RuntimeConfig.init_config(USE_LOCAL_DATABASE=False)
     else:
         raise Exception('can not init database')
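
A minimal sketch of the same backend switch, assuming peewee with its playhouse extensions (apsw for SQLite, a MySQL driver for the pool) is installed; the function name and settings values are placeholders:

from playhouse.apsw_ext import APSWDatabase
from playhouse.pool import PooledMySQLDatabase

def open_database(work_mode, database_settings):
    # keep the module-level settings dict untouched and split off the database name
    database_config = database_settings.copy()
    db_name = database_config.pop("name")
    if work_mode == "standalone":
        return APSWDatabase("fate_flow_sqlite.db")
    elif work_mode == "cluster":
        return PooledMySQLDatabase(db_name, **database_config)
    raise Exception("can not init database for work mode {}".format(work_mode))

# construct (but do not yet connect) a local SQLite database, as in standalone mode
db = open_database("standalone", {"name": "fate_flow"})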
Example 3: saving the pipelined model of a training job by packing the job DSL, runtime conf and predict DSL into a Pipeline protobuf
 def save_pipelined_model(cls, job_id, role, party_id):
     schedule_logger(job_id).info(
         'job {} on {} {} start to save pipeline'.format(
             job_id, role, party_id))
     job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
         job_id=job_id, role=role, party_id=party_id)
     job_parameters = job_runtime_conf.get('job_parameters', {})
     model_id = job_parameters['model_id']
     model_version = job_parameters['model_version']
     job_type = job_parameters.get('job_type', '')
     if job_type == 'predict':
         return
     dag = schedule_utils.get_job_dsl_parser(
         dsl=job_dsl,
         runtime_conf=job_runtime_conf,
         train_runtime_conf=train_runtime_conf)
     predict_dsl = dag.get_predict_dsl(role=role)
     pipeline = pipeline_pb2.Pipeline()
     pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
     pipeline.train_dsl = json_dumps(job_dsl, byte=True)
     pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
     pipeline.fate_version = RuntimeConfig.get_env("FATE")
     pipeline.model_id = model_id
     pipeline.model_version = model_version
     tracker = Tracker(job_id=job_id,
                       role=role,
                       party_id=party_id,
                       model_id=model_id,
                       model_version=model_version)
     tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
     if role != 'local':
         tracker.save_machine_learning_model_info()
     schedule_logger(job_id).info(
         'job {} on {} {} save pipeline successfully'.format(
             job_id, role, party_id))
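
The json_dumps(..., byte=True) calls above presumably serialize a conf dict to UTF-8 bytes so it can be assigned to a bytes field of the Pipeline protobuf; a minimal stand-in for such a helper:

import json

def json_dumps(src, byte=False, indent=None):
    # serialize to str, optionally encode to bytes for protobuf bytes fields
    dest = json.dumps(src, indent=indent)
    return dest.encode("utf-8") if byte else dest

train_runtime_conf = {"job_parameters": {"model_id": "guest-9999#host-10000#model"}}
assert isinstance(json_dumps(train_runtime_conf, byte=True), bytes)
assert json.loads(json_dumps(train_runtime_conf, byte=True)) == train_runtime_conf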
Example 4: an extended save_pipelined_model variant that additionally records roles, work mode, initiator and the per-party runtime conf
    def save_pipelined_model(cls, job_id, role, party_id):
        schedule_logger(job_id).info(
            'job {} on {} {} start to save pipeline'.format(
                job_id, role, party_id))
        job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = job_utils.get_job_configuration(
            job_id=job_id, role=role, party_id=party_id)
        job_parameters = runtime_conf_on_party.get('job_parameters', {})
        if role in job_parameters.get("assistant_role", []):
            return
        model_id = job_parameters['model_id']
        model_version = job_parameters['model_version']
        job_type = job_parameters.get('job_type', '')
        work_mode = job_parameters['work_mode']
        roles = runtime_conf_on_party['role']
        initiator_role = runtime_conf_on_party['initiator']['role']
        initiator_party_id = runtime_conf_on_party['initiator']['party_id']
        if job_type == 'predict':
            return
        dag = schedule_utils.get_job_dsl_parser(
            dsl=job_dsl,
            runtime_conf=job_runtime_conf,
            train_runtime_conf=train_runtime_conf)
        predict_dsl = dag.get_predict_dsl(role=role)
        pipeline = pipeline_pb2.Pipeline()
        pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
        pipeline.train_dsl = json_dumps(job_dsl, byte=True)
        pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
        pipeline.fate_version = RuntimeConfig.get_env("FATE")
        pipeline.model_id = model_id
        pipeline.model_version = model_version

        pipeline.parent = True
        pipeline.loaded_times = 0
        pipeline.roles = json_dumps(roles, byte=True)
        pipeline.work_mode = work_mode
        pipeline.initiator_role = initiator_role
        pipeline.initiator_party_id = initiator_party_id
        pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party,
                                                    byte=True)
        pipeline.parent_info = json_dumps({}, byte=True)

        tracker = Tracker(job_id=job_id,
                          role=role,
                          party_id=party_id,
                          model_id=model_id,
                          model_version=model_version)
        tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
        if role != 'local':
            tracker.save_machine_learning_model_info()
        schedule_logger(job_id).info(
            'job {} on {} {} save pipeline successfully'.format(
                job_id, role, party_id))
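
An illustrative shape for runtime_conf_on_party, showing only the keys read in the example above; the party ids, model id and version are placeholders:

runtime_conf_on_party = {
    "initiator": {"role": "guest", "party_id": 9999},
    "role": {"guest": [9999], "host": [10000], "arbiter": [10000]},
    "job_parameters": {
        "model_id": "arbiter-10000#guest-9999#host-10000#model",
        "model_version": "202101010000000000000",
        "work_mode": 1,
    },
}

job_parameters = runtime_conf_on_party.get("job_parameters", {})
roles = runtime_conf_on_party["role"]
initiator_role = runtime_conf_on_party["initiator"]["role"]
initiator_party_id = runtime_conf_on_party["initiator"]["party_id"]
print(job_parameters["model_id"], roles, initiator_role, initiator_party_id)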
Example 5: the task executor entry point (run_task): argument parsing, session and federation setup, component execution, and output data/model persistence with status reporting
    def run_task(cls):
        task_info = {}
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j',
                                '--job_id',
                                required=True,
                                type=str,
                                help="job id")
            parser.add_argument('-n',
                                '--component_name',
                                required=True,
                                type=str,
                                help="component name")
            parser.add_argument('-t',
                                '--task_id',
                                required=True,
                                type=str,
                                help="task id")
            parser.add_argument('-v',
                                '--task_version',
                                required=True,
                                type=int,
                                help="task version")
            parser.add_argument('-r',
                                '--role',
                                required=True,
                                type=str,
                                help="role")
            parser.add_argument('-p',
                                '--party_id',
                                required=True,
                                type=int,
                                help="party id")
            parser.add_argument('-c',
                                '--config',
                                required=True,
                                type=str,
                                help="task parameters")
            parser.add_argument('--run_ip', help="run ip", type=str)
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger(args.job_id).info('enter task process')
            schedule_logger(args.job_id).info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(
                    JOB_SERVER_HOST=args.job_server.split(':')[0],
                    HTTP_PORT=args.job_server.split(':')[1])
                RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            task_version = args.task_version
            role = args.role
            party_id = args.party_id
            executor_pid = os.getpid()
            task_info.update({
                "job_id": job_id,
                "component_name": component_name,
                "task_id": task_id,
                "task_version": task_version,
                "role": role,
                "party_id": party_id,
                "run_ip": args.run_ip,
                "run_pid": executor_pid
            })
            start_time = current_timestamp()
            job_conf = job_utils.get_job_conf(job_id, role)
            job_dsl = job_conf["job_dsl_path"]
            job_runtime_conf = job_conf["job_runtime_conf_path"]
            dsl_parser = schedule_utils.get_job_dsl_parser(
                dsl=job_dsl,
                runtime_conf=job_runtime_conf,
                train_runtime_conf=job_conf["train_runtime_conf_path"],
                pipeline_dsl=job_conf["pipeline_dsl_path"])
            party_index = job_runtime_conf["role"][role].index(party_id)
            job_args_on_party = TaskExecutor.get_job_args_on_party(
                dsl_parser, job_runtime_conf, role, party_id)
            component = dsl_parser.get_component_info(
                component_name=component_name)
            component_parameters = component.get_role_parameters()
            component_parameters_on_party = component_parameters[role][
                party_index] if role in component_parameters else {}
            module_name = component.get_module()
            task_input_dsl = component.get_input()
            task_output_dsl = component.get_output()
            component_parameters_on_party[
                'output_data_name'] = task_output_dsl.get('data')
            task_parameters = RunParameters(
                **file_utils.load_json_conf(args.config))
            job_parameters = task_parameters
            if job_parameters.assistant_role:
                TaskExecutor.monkey_patch()
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
            task_info["party_status"] = TaskStatus.FAILED
            return
        try:
            job_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role,
                str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log.LoggerFactory.set_directory(directory=task_log_dir,
                                            parent_log_dir=job_log_dir,
                                            append_to_parent_log=True,
                                            force=True)

            tracker = Tracker(job_id=job_id,
                              role=role,
                              party_id=party_id,
                              component_name=component_name,
                              task_id=task_id,
                              task_version=task_version,
                              model_id=job_parameters.model_id,
                              model_version=job_parameters.model_version,
                              component_module_name=module_name,
                              job_parameters=job_parameters)
            tracker_client = TrackerClient(
                job_id=job_id,
                role=role,
                party_id=party_id,
                component_name=component_name,
                task_id=task_id,
                task_version=task_version,
                model_id=job_parameters.model_id,
                model_version=job_parameters.model_version,
                component_module_name=module_name,
                job_parameters=job_parameters)
            run_class_paths = component_parameters_on_party.get(
                'CodePath').split('/')
            run_class_package = '.'.join(
                run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                    '.py', '')
            run_class_name = run_class_paths[-1]
            task_info["party_status"] = TaskStatus.RUNNING
            cls.report_task_update_to_driver(task_info=task_info)

            # init environment, process is shared globally
            RuntimeConfig.init_config(
                WORK_MODE=job_parameters.work_mode,
                COMPUTING_ENGINE=job_parameters.computing_engine,
                FEDERATION_ENGINE=job_parameters.federation_engine,
                FEDERATED_MODE=job_parameters.federated_mode)

            if RuntimeConfig.COMPUTING_ENGINE == ComputingEngine.EGGROLL:
                session_options = task_parameters.eggroll_run.copy()
            else:
                session_options = {}

            sess = session.Session(
                computing_type=job_parameters.computing_engine,
                federation_type=job_parameters.federation_engine)
            computing_session_id = job_utils.generate_session_id(
                task_id, task_version, role, party_id)
            sess.init_computing(computing_session_id=computing_session_id,
                                options=session_options)
            federation_session_id = job_utils.generate_task_version_id(
                task_id, task_version)
            component_parameters_on_party[
                "job_parameters"] = job_parameters.to_dict()
            sess.init_federation(
                federation_session_id=federation_session_id,
                runtime_conf=component_parameters_on_party,
                service_conf=job_parameters.engines_address.get(
                    EngineType.FEDERATION, {}))
            sess.as_default()

            schedule_logger().info('Run {} {} {} {} {} task'.format(
                job_id, component_name, task_id, role, party_id))
            schedule_logger().info("Component parameters on party {}".format(
                component_parameters_on_party))
            schedule_logger().info("Task input dsl {}".format(task_input_dsl))
            task_run_args = cls.get_task_run_args(
                job_id=job_id,
                role=role,
                party_id=party_id,
                task_id=task_id,
                task_version=task_version,
                job_args=job_args_on_party,
                job_parameters=job_parameters,
                task_parameters=task_parameters,
                input_dsl=task_input_dsl,
            )
            if module_name in {"Upload", "Download", "Reader", "Writer"}:
                task_run_args["job_parameters"] = job_parameters
            run_object = getattr(importlib.import_module(run_class_package),
                                 run_class_name)()
            run_object.set_tracker(tracker=tracker_client)
            run_object.set_task_version_id(
                task_version_id=job_utils.generate_task_version_id(
                    task_id, task_version))
            # add profile logs
            profile.profile_start()
            run_object.run(component_parameters_on_party, task_run_args)
            profile.profile_ends()
            output_data = run_object.save_data()
            if not isinstance(output_data, list):
                output_data = [output_data]
            for index in range(0, len(output_data)):
                data_name = task_output_dsl.get(
                    'data')[index] if task_output_dsl.get(
                        'data') else '{}'.format(index)
                persistent_table_namespace, persistent_table_name = tracker.save_output_data(
                    computing_table=output_data[index],
                    output_storage_engine=job_parameters.storage_engine,
                    output_storage_address=job_parameters.engines_address.get(
                        EngineType.STORAGE, {}))
                if persistent_table_namespace and persistent_table_name:
                    tracker.log_output_data_info(
                        data_name=data_name,
                        table_namespace=persistent_table_namespace,
                        table_name=persistent_table_name)
            output_model = run_object.export_model()
            # There is only one model output at the current dsl version.
            tracker.save_output_model(
                output_model, task_output_dsl['model'][0]
                if task_output_dsl.get('model') else 'default')
            task_info["party_status"] = TaskStatus.SUCCESS
        except Exception as e:
            task_info["party_status"] = TaskStatus.FAILED
            schedule_logger().exception(e)
        finally:
            try:
                task_info["end_time"] = current_timestamp()
                task_info["elapsed"] = task_info["end_time"] - start_time
                cls.report_task_update_to_driver(task_info=task_info)
            except Exception as e:
                task_info["party_status"] = TaskStatus.FAILED
                traceback.print_exc()
                schedule_logger().exception(e)
        schedule_logger().info('task {} {} {} start time: {}'.format(
            task_id, role, party_id, timestamp_to_date(start_time)))
        schedule_logger().info('task {} {} {} end time: {}'.format(
            task_id, role, party_id, timestamp_to_date(task_info["end_time"])))
        schedule_logger().info('task {} {} {} takes {}s'.format(
            task_id, role, party_id,
            int(task_info["elapsed"]) / 1000))
        schedule_logger().info('Finish {} {} {} {} {} {} task {}'.format(
            job_id, component_name, task_id, task_version, role, party_id,
            task_info["party_status"]))

        print('Finish {} {} {} {} {} {} task {}'.format(
            job_id, component_name, task_id, task_version, role, party_id,
            task_info["party_status"]))
        return task_info
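
The control flow above reduces to a status-and-timing skeleton; a minimal runnable sketch of that pattern (report_to_driver stands in for report_task_update_to_driver, and statuses are plain strings here):

import time
import traceback

def run_with_status_reporting(do_work, report_to_driver):
    task_info = {"party_status": "RUNNING"}
    start_time = int(time.time() * 1000)          # milliseconds, like current_timestamp()
    report_to_driver(task_info)
    try:
        do_work()
        task_info["party_status"] = "SUCCESS"
    except Exception:
        traceback.print_exc()
        task_info["party_status"] = "FAILED"
    finally:
        task_info["end_time"] = int(time.time() * 1000)
        task_info["elapsed"] = task_info["end_time"] - start_time
        report_to_driver(task_info)               # always report the final status
    return task_info

info = run_with_status_reporting(lambda: None, lambda i: print("report:", i))
print("took {}s".format(info["elapsed"] / 1000))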
Example 6: server startup: signal handlers, database and runtime config initialization, the detector and DAG scheduler, and the gRPC server
        })
    # init
    signal.signal(signal.SIGTERM, job_utils.cleaning)
    signal.signal(signal.SIGCHLD, job_utils.wait_child_process)
    # init db
    init_flow_db()
    init_arch_db()
    # init runtime config
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--standalone_node',
                        default=False,
                        help="if standalone node mode or not ",
                        action='store_true')
    args = parser.parse_args()
    RuntimeConfig.init_env()
    RuntimeConfig.set_process_role(ProcessRole.DRIVER)
    PrivilegeAuth.init()
    ServiceUtils.register()
    ResourceManager.initialize()
    Detector(interval=5 * 1000).start()
    DAGScheduler(interval=2 * 1000).start()
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=10),
        options=[(cygrpc.ChannelArgKey.max_send_message_length, -1),
                 (cygrpc.ChannelArgKey.max_receive_message_length, -1)])

    proxy_pb2_grpc.add_DataTransferServiceServicer_to_server(
        UnaryService(), server)
    server.add_insecure_port("{}:{}".format(IP, GRPC_PORT))
    server.start()
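
A minimal bring-up of the same gRPC server, assuming grpcio is installed; the string channel options are the public equivalents of the cygrpc keys used above, and the host/port are placeholders:

from concurrent import futures
import grpc

server = grpc.server(
    futures.ThreadPoolExecutor(max_workers=10),
    options=[("grpc.max_send_message_length", -1),      # -1 means unlimited
             ("grpc.max_receive_message_length", -1)])
# a real deployment registers its servicer here, e.g.
# proxy_pb2_grpc.add_DataTransferServiceServicer_to_server(UnaryService(), server)
server.add_insecure_port("127.0.0.1:9360")
server.start()
server.stop(0)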
Example 7: service settings: Redis connection parameters and proxy/fateboard/serving endpoints loaded from server_conf.json
    'host': 'mysql',
    'port': 3306,
    'max_connections': 100,
    'stale_timeout': 30,
}

REDIS = {
    'host': 'redis',
    'port': 6379,
    'password': '******',
    'max_connections': 500
}

REDIS_QUEUE_DB_INDEX = 0
JOB_MODULE_CONF = file_utils.load_json_conf("fate_flow/job_module_conf.json")

"""
Services
"""
server_conf = file_utils.load_json_conf("arch/conf/server_conf.json")
PROXY_HOST = server_conf.get(SERVERS).get('proxy').get('host')
PROXY_PORT = server_conf.get(SERVERS).get('proxy').get('port')
BOARD_HOST = server_conf.get(SERVERS).get('fateboard').get('host')
if BOARD_HOST == 'localhost':
    BOARD_HOST = get_lan_ip()
BOARD_PORT = server_conf.get(SERVERS).get('fateboard').get('port')
SERVINGS = server_conf.get(SERVERS).get('servings')
BOARD_DASHBOARD_URL = 'http://%s:%d/index.html#/dashboard?job_id={}&role={}&party_id={}' % (BOARD_HOST, BOARD_PORT)
RuntimeConfig.init_config(WORK_MODE=WORK_MODE)
RuntimeConfig.init_config(HTTP_PORT=HTTP_PORT)
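
A self-contained sketch of how those endpoints are resolved; the JSON below mirrors the keys the example reads ("servers", "proxy", "fateboard") with placeholder values, and it also shows why BOARD_DASHBOARD_URL mixes %-style and {}-style placeholders:

import json

# placeholder config in the same shape as the server_conf.json read above
server_conf = json.loads("""
{
  "servers": {
    "proxy": {"host": "192.168.0.1", "port": 9370},
    "fateboard": {"host": "192.168.0.2", "port": 8080}
  }
}
""")
SERVERS = "servers"
PROXY_HOST = server_conf.get(SERVERS).get("proxy").get("host")
PROXY_PORT = server_conf.get(SERVERS).get("proxy").get("port")
BOARD_HOST = server_conf.get(SERVERS).get("fateboard").get("host")
BOARD_PORT = server_conf.get(SERVERS).get("fateboard").get("port")

# host and port are filled now with %-formatting; job_id/role/party_id stay as
# {} placeholders and are filled per job with str.format later
BOARD_DASHBOARD_URL = 'http://%s:%d/index.html#/dashboard?job_id={}&role={}&party_id={}' % (BOARD_HOST, BOARD_PORT)
print(BOARD_DASHBOARD_URL.format("202101011200000000", "guest", 9999))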
Example 8: a run_task variant built around a Task record and Tracking, with explicit status synchronization to the initiator
    def run_task():
        task = Task()
        task.f_create_time = current_timestamp()
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
            parser.add_argument('-n', '--component_name', required=True, type=str,
                                help="component name")
            parser.add_argument('-t', '--task_id', required=True, type=str, help="task id")
            parser.add_argument('-r', '--role', required=True, type=str, help="role")
            parser.add_argument('-p', '--party_id', required=True, type=str, help="party id")
            parser.add_argument('-c', '--config', required=True, type=str, help="task config")
            parser.add_argument('--processors_per_node', help="processors_per_node", type=int)
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger(args.job_id).info('enter task process')
            schedule_logger(args.job_id).info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(HTTP_PORT=args.job_server.split(':')[1])
                RuntimeConfig.set_process_role(ProcessRole.EXECUTOR)
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            role = args.role
            party_id = int(args.party_id)
            executor_pid = os.getpid()
            task_config = file_utils.load_json_conf(args.config)
            job_parameters = task_config['job_parameters']
            job_initiator = task_config['job_initiator']
            job_args = task_config['job_args']
            task_input_dsl = task_config['input']
            task_output_dsl = task_config['output']
            component_parameters = TaskExecutor.get_parameters(job_id, component_name, role, party_id)
            task_parameters = task_config['task_parameters']
            module_name = task_config['module_name']
            TaskExecutor.monkey_patch()
        except Exception as e:
            traceback.print_exc()
            schedule_logger().exception(e)
            task.f_status = TaskStatus.FAILED
            return
        try:
            job_log_dir = os.path.join(job_utils.get_job_log_directory(job_id=job_id), role, str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log_utils.LoggerFactory.set_directory(directory=task_log_dir, parent_log_dir=job_log_dir,
                                                  append_to_parent_log=True, force=True)

            task.f_job_id = job_id
            task.f_component_name = component_name
            task.f_task_id = task_id
            task.f_role = role
            task.f_party_id = party_id
            task.f_operator = 'python_operator'
            tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name,
                               task_id=task_id,
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'],
                               component_module_name=module_name)
            task.f_start_time = current_timestamp()
            task.f_run_ip = get_lan_ip()
            task.f_run_pid = executor_pid
            run_class_paths = component_parameters.get('CodePath').split('/')
            run_class_package = '.'.join(run_class_paths[:-2]) + '.' + run_class_paths[-2].replace('.py', '')
            run_class_name = run_class_paths[-1]
            task.f_status = TaskStatus.RUNNING
            TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                          party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                          initiator_role=job_initiator.get('role', None),
                                          task_info=task.to_json())

            # init environment, process is shared globally
            RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'],
                                      BACKEND=job_parameters.get('backend', 0))
            if args.processors_per_node and args.processors_per_node > 0 and RuntimeConfig.BACKEND == Backend.EGGROLL:
                session_options = {"eggroll.session.processors.per.node": args.processors_per_node}
            else:
                session_options = {}
            session.init(job_id=job_utils.generate_session_id(task_id, role, party_id),
                         mode=RuntimeConfig.WORK_MODE,
                         backend=RuntimeConfig.BACKEND,
                         options=session_options)
            federation.init(job_id=task_id, runtime_conf=component_parameters)

            schedule_logger().info('run {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id))
            schedule_logger().info(component_parameters)
            schedule_logger().info(task_input_dsl)
            task_run_args = TaskExecutor.get_task_run_args(job_id=job_id, role=role, party_id=party_id,
                                                           task_id=task_id,
                                                           job_args=job_args,
                                                           job_parameters=job_parameters,
                                                           task_parameters=task_parameters,
                                                           input_dsl=task_input_dsl,
                                                           if_save_as_task_input_data=job_parameters.get("save_as_task_input_data", SAVE_AS_TASK_INPUT_DATA_SWITCH)
                                                           )
            run_object = getattr(importlib.import_module(run_class_package), run_class_name)()
            run_object.set_tracker(tracker=tracker)
            run_object.set_taskid(taskid=task_id)
            run_object.run(component_parameters, task_run_args)
            output_data = run_object.save_data()
            tracker.save_output_data_table(output_data, task_output_dsl.get('data')[0] if task_output_dsl.get('data') else 'component')
            output_model = run_object.export_model()
            # There is only one model output at the current dsl version.
            tracker.save_output_model(output_model, task_output_dsl['model'][0] if task_output_dsl.get('model') else 'default')
            task.f_status = TaskStatus.COMPLETE
        except Exception as e:
            task.f_status = TaskStatus.FAILED
            schedule_logger().exception(e)
        finally:
            sync_success = False
            try:
                task.f_end_time = current_timestamp()
                task.f_elapsed = task.f_end_time - task.f_start_time
                task.f_update_time = current_timestamp()
                TaskExecutor.sync_task_status(job_id=job_id, component_name=component_name, task_id=task_id, role=role,
                                              party_id=party_id,
                                              initiator_party_id=job_initiator.get('party_id', None),
                                              initiator_role=job_initiator.get('role', None),
                                              task_info=task.to_json())
                sync_success = True
            except Exception as e:
                traceback.print_exc()
                schedule_logger().exception(e)
        schedule_logger().info('task {} {} {} start time: {}'.format(task_id, role, party_id, timestamp_to_date(task.f_start_time)))
        schedule_logger().info('task {} {} {} end time: {}'.format(task_id, role, party_id, timestamp_to_date(task.f_end_time)))
        schedule_logger().info('task {} {} {} takes {}s'.format(task_id, role, party_id, int(task.f_elapsed)/1000))
        schedule_logger().info(
            'finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status if sync_success else TaskStatus.FAILED))

        print('finish {} {} {} {} {} {} task'.format(job_id, component_name, task_id, role, party_id, task.f_status if sync_success else TaskStatus.FAILED))
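
The CodePath handling shared by the run_task variants above follows a simple convention; a sketch with a hypothetical path of the same shape (package segments, a module file, then a class name):

def resolve_run_class(code_path):
    # e.g. "federatedml/model_base.py/ModelBase" ->
    #      package "federatedml.model_base", class name "ModelBase"
    parts = code_path.split('/')
    package = '.'.join(parts[:-2]) + '.' + parts[-2].replace('.py', '')
    class_name = parts[-1]
    return package, class_name

package, class_name = resolve_run_class('federatedml/model_base.py/ModelBase')
print(package, class_name)
# the executor then instantiates it with:
#   getattr(importlib.import_module(package), class_name)()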
Example 9: an HTTP handler that returns the version of the requested module from the runtime environment
def get_fate_version_info():
    module = request.json.get('module', 'FATE')
    version = RuntimeConfig.get_env(module)
    return get_json_result(data={module: version})
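
A minimal Flask sketch of such an endpoint (illustrative: the route, the version table and the response shape are assumptions, with get_json_result approximated by a plain JSON body):

from flask import Flask, request, jsonify

app = Flask(__name__)
VERSIONS = {"FATE": "1.5.0", "FATEFlow": "1.5.0"}   # placeholder version table

@app.route("/v1/version/get", methods=["POST"])
def get_fate_version_info():
    module = request.json.get("module", "FATE")
    return jsonify(retcode=0, retmsg="success", data={module: VERSIONS.get(module)})

# POSTing {"module": "FATE"} returns {"data": {"FATE": "1.5.0"}, ...}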
Example 10: a server startup variant: WSGI app mounting, session and job queue initialization, job cleaning, detector and DAG scheduler
         '/{}/forward'.format(API_VERSION): forward_app_manager
     })
 # init
 signal.signal(signal.SIGTERM, job_utils.cleaning)
 signal.signal(signal.SIGCHLD, job_utils.wait_child_process)
 init_database_tables()
 # init runtime config
 import argparse
 parser = argparse.ArgumentParser()
 parser.add_argument('--standalone_node',
                     default=False,
                     help="if standalone node mode or not ",
                     action='store_true')
 args = parser.parse_args()
 if args.standalone_node:
     RuntimeConfig.init_config(WORK_MODE=WorkMode.STANDALONE)
     RuntimeConfig.init_config(HTTP_PORT=CLUSTER_STANDALONE_JOB_SERVER_PORT)
 session_utils.init_session_for_flow_server()
 RuntimeConfig.init_env()
 RuntimeConfig.set_process_role(ProcessRole.SERVER)
 queue_manager.init_job_queue()
 job_controller.JobController.init()
 history_job_clean = job_controller.JobClean()
 history_job_clean.start()
 PrivilegeAuth.init()
 ServiceUtils.register()
 # start job detector
 job_detector.JobDetector(interval=5 * 1000).start()
 # start scheduler
 scheduler = dag_scheduler.DAGScheduler(
     queue=RuntimeConfig.JOB_QUEUE, concurrent_num=MAX_CONCURRENT_JOB_RUN)
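
The detector and scheduler objects above are started with a millisecond interval; a minimal stand-in showing that interval-driven background-thread pattern (class and callback names are made up):

import threading

class IntervalService(threading.Thread):
    def __init__(self, interval, run_once):
        super().__init__(daemon=True)
        self.interval = interval / 1000.0     # interval is passed in milliseconds
        self.run_once = run_once
        self._stopped = threading.Event()

    def run(self):
        # wait() returns False on timeout, so the loop runs once per interval
        while not self._stopped.wait(self.interval):
            self.run_once()

    def stop(self):
        self._stopped.set()

detector = IntervalService(interval=5 * 1000, run_once=lambda: print("detection pass"))
detector.start()
detector.stop()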
Example 11: a run_task variant that initializes storage and federation directly and syncs task status back to the initiator
    def run_task():
        task = Task()
        task.f_create_time = current_timestamp()
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j',
                                '--job_id',
                                required=True,
                                type=str,
                                help="job id")
            parser.add_argument('-n',
                                '--component_name',
                                required=True,
                                type=str,
                                help="component name")
            parser.add_argument('-t',
                                '--task_id',
                                required=True,
                                type=str,
                                help="task id")
            parser.add_argument('-r',
                                '--role',
                                required=True,
                                type=str,
                                help="role")
            parser.add_argument('-p',
                                '--party_id',
                                required=True,
                                type=str,
                                help="party id")
            parser.add_argument('-c',
                                '--config',
                                required=True,
                                type=str,
                                help="task config")
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger.info('enter task process')
            schedule_logger.info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(
                    HTTP_PORT=args.job_server.split(':')[1])
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            role = args.role
            party_id = int(args.party_id)
            task_config = file_utils.load_json_conf(args.config)
            job_parameters = task_config['job_parameters']
            job_initiator = task_config['job_initiator']
            job_args = task_config['job_args']
            task_input_dsl = task_config['input']
            task_output_dsl = task_config['output']
            parameters = task_config['parameters']
            module_name = task_config['module_name']
        except Exception as e:
            schedule_logger.exception(e)
            task.f_status = TaskStatus.FAILED
            return
        try:
            # init environment, process is shared globally
            RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
            storage.init_storage(job_id=task_id,
                                 work_mode=RuntimeConfig.WORK_MODE)
            federation.init(job_id=task_id, runtime_conf=parameters)
            job_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role,
                str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                                  parent_log_dir=job_log_dir,
                                                  append_to_parent_log=True,
                                                  force=True)

            task.f_job_id = job_id
            task.f_component_name = component_name
            task.f_task_id = task_id
            task.f_role = role
            task.f_party_id = party_id
            task.f_operator = 'python_operator'
            tracker = Tracking(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               component_name=component_name,
                               task_id=task_id,
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'],
                               module_name=module_name)
            task.f_start_time = current_timestamp()
            task.f_run_ip = get_lan_ip()
            task.f_run_pid = os.getpid()
            run_class_paths = parameters.get('CodePath').split('/')
            run_class_package = '.'.join(
                run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                    '.py', '')
            run_class_name = run_class_paths[-1]
            task_run_args = TaskExecutor.get_task_run_args(
                job_id=job_id,
                role=role,
                party_id=party_id,
                job_parameters=job_parameters,
                job_args=job_args,
                input_dsl=task_input_dsl)
            run_object = getattr(importlib.import_module(run_class_package),
                                 run_class_name)()
            run_object.set_tracker(tracker=tracker)
            run_object.set_taskid(taskid=task_id)
            task.f_status = TaskStatus.RUNNING
            TaskExecutor.sync_task_status(job_id=job_id,
                                          component_name=component_name,
                                          task_id=task_id,
                                          role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get(
                                              'party_id', None),
                                          task_info=task.to_json())

            schedule_logger.info('run {} {} {} {} {} task'.format(
                job_id, component_name, task_id, role, party_id))
            schedule_logger.info(parameters)
            schedule_logger.info(task_input_dsl)
            run_object.run(parameters, task_run_args)
            if task_output_dsl:
                if task_output_dsl.get('data', []):
                    output_data = run_object.save_data()
                    tracker.save_output_data_table(
                        output_data,
                        task_output_dsl.get('data')[0])
                if task_output_dsl.get('model', []):
                    output_model = run_object.export_model()
                    # There is only one model output at the current dsl version.
                    tracker.save_output_model(output_model,
                                              task_output_dsl['model'][0])
            task.f_status = TaskStatus.SUCCESS
        except Exception as e:
            schedule_logger.exception(e)
            task.f_status = TaskStatus.FAILED
        finally:
            try:
                task.f_end_time = current_timestamp()
                task.f_elapsed = task.f_end_time - task.f_start_time
                task.f_update_time = current_timestamp()
                TaskExecutor.sync_task_status(
                    job_id=job_id,
                    component_name=component_name,
                    task_id=task_id,
                    role=role,
                    party_id=party_id,
                    initiator_party_id=job_initiator.get('party_id', None),
                    task_info=task.to_json())
            except Exception as e:
                schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name,
                                                     task_id, role, party_id,
                                                     task.f_status))
Example 12: a server startup variant that mounts the pipeline/schedule apps, initializes storage and the job queue, and starts the scheduler and gRPC server
            '/{}/pipeline'.format(API_VERSION): pipeline_app_manager,
            '/{}/schedule'.format(API_VERSION): schedule_app_manager
        })
    # init
    signal.signal(signal.SIGCHLD, job_utils.wait_child_process)
    init_database_tables()
    # init runtime config
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--standalone_node',
                        default=False,
                        help="if standalone node mode or not ",
                        action='store_true')
    args = parser.parse_args()
    if args.standalone_node:
        RuntimeConfig.init_config(WORK_MODE=WorkMode.STANDALONE)
        RuntimeConfig.init_config(HTTP_PORT=CLUSTER_STANDALONE_JOB_SERVER_PORT)

    storage.init_storage(work_mode=RuntimeConfig.WORK_MODE)
    queue_manager.init_job_queue()
    job_controller.JobController.init()
    # start job detector
    job_detector.JobDetector(interval=5 * 1000).start()
    # start scheduler
    scheduler = dag_scheduler.DAGScheduler(
        queue=RuntimeConfig.JOB_QUEUE, concurrent_num=MAX_CONCURRENT_JOB_RUN)
    scheduler.start()
    # start grpc server
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=10),
        options=[(cygrpc.ChannelArgKey.max_send_message_length, -1),
Example 13: settings excerpt: endpoint constants, logger configuration, feature switches, and runtime config initialization
# Endpoint
FATE_FLOW_MODEL_TRANSFER_ENDPOINT = "/v1/model/transfer"
FATE_MANAGER_GET_NODE_INFO_ENDPOINT = "/fate-manager/api/site/secretinfo"
FATE_MANAGER_NODE_CHECK_ENDPOINT = "/fate-manager/api/site/checksite"
FATE_BOARD_DASHBOARD_ENDPOINT = "/index.html#/dashboard?job_id={}&role={}&party_id={}"

# Logger
log.LoggerFactory.LEVEL = 10
# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
log.LoggerFactory.set_directory(
    os.path.join(file_utils.get_project_base_directory(), "logs", "fate_flow"))
stat_logger = log.getLogger("fate_flow_stat")
detect_logger = log.getLogger("fate_flow_detect")
access_logger = log.getLogger("fate_flow_access")
data_manager_logger = log.getLogger("fate_flow_data_manager")
peewee_logger = log.getLogger("peewee")

# Switch
UPLOAD_DATA_FROM_CLIENT = True
USE_AUTHENTICATION = False
PRIVILEGE_COMMAND_WHITELIST = []
CHECK_NODES_IDENTITY = False
DEFAULT_FEDERATED_STATUS_COLLECT_TYPE = get_base_config(
    FATEFLOW_SERVICE_NAME, {}).get("default_federated_status_collect_type",
                                   "PUSH")

# Init
RuntimeConfig.init_config(WORK_MODE=WORK_MODE)
RuntimeConfig.init_config(JOB_SERVER_HOST=IP, HTTP_PORT=HTTP_PORT)
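
A standard-library approximation of the logger setup above (the directory layout and format string are placeholders; level 10 is logging.DEBUG, matching the comment in the example):

import logging
import os

LOG_DIR = os.path.join("logs", "fate_flow")
os.makedirs(LOG_DIR, exist_ok=True)

def get_logger(name, level=10):               # 10 == logging.DEBUG
    logger = logging.getLogger(name)
    logger.setLevel(level)
    handler = logging.FileHandler(os.path.join(LOG_DIR, "{}.log".format(name)))
    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(handler)
    return logger

stat_logger = get_logger("fate_flow_stat")
detect_logger = get_logger("fate_flow_detect")
stat_logger.info("stat logger ready")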
Example 14: the __main__ entry point that mounts the Flask apps with DispatcherMiddleware and starts the job detector, DAG scheduler, and gRPC server
if __name__ == '__main__':
    manager.url_map.strict_slashes = False
    app = DispatcherMiddleware(
        manager, {
            '/{}/data'.format(API_VERSION): data_access_app_manager,
            '/{}/model'.format(API_VERSION): model_app_manager,
            '/{}/job'.format(API_VERSION): job_app_manager,
            '/{}/table'.format(API_VERSION): table_app_manager,
            '/{}/tracking'.format(API_VERSION): tracking_app_manager,
            '/{}/pipeline'.format(API_VERSION): pipeline_app_manager,
        })
    # init
    signal.signal(signal.SIGCHLD, job_utils.wait_child_process)
    init_database_tables()
    RuntimeConfig.init_config(WORK_MODE=WORK_MODE)
    storage.init_storage(work_mode=RuntimeConfig.WORK_MODE)
    queue_manager.init_job_queue()
    job_controller.JobController.init()
    # start job detector
    job_detector.JobDetector(interval=5 * 1000).start()
    # start scheduler
    scheduler = dag_scheduler.DAGScheduler(
        queue=RuntimeConfig.JOB_QUEUE, concurrent_num=MAX_CONCURRENT_JOB_RUN)
    scheduler.start()
    # start grpc server
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=10),
        options=[(cygrpc.ChannelArgKey.max_send_message_length, -1),
                 (cygrpc.ChannelArgKey.max_receive_message_length, -1)])
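
A minimal runnable sketch of the DispatcherMiddleware mounting, assuming Flask and Werkzeug are installed; the sub-app, route and port are placeholders:

from flask import Flask, jsonify
from werkzeug.middleware.dispatcher import DispatcherMiddleware
from werkzeug.serving import run_simple

API_VERSION = "v1"
manager = Flask("manager")                    # root app
job_app_manager = Flask("job_app_manager")    # one of the mounted sub-apps

@job_app_manager.route("/submit", methods=["POST"])
def submit_job():
    return jsonify(retcode=0, retmsg="success")

application = DispatcherMiddleware(manager, {
    "/{}/job".format(API_VERSION): job_app_manager,   # served at /v1/job/submit
})

if __name__ == "__main__":
    run_simple("127.0.0.1", 9380, application)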