Example #1
 def check_task_status(job_id, component, interval=1):
     task_id = job_utils.generate_task_id(
         job_id=job_id, component_name=component.get_name())
     while True:
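         # poll until every party reports a terminal status for this task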
         try:
             status_collect = set()
             parameters = component.get_role_parameters()
             for _role, _partys_parameters in parameters.items():
                 for _party_parameters in _partys_parameters:
                     _party_id = _party_parameters.get('local',
                                                       {}).get('party_id')
                     tasks = query_task(job_id=job_id,
                                        task_id=task_id,
                                        role=_role,
                                        party_id=_party_id)
                     if tasks:
                         task_status = tasks[0].f_status
                     else:
                         task_status = 'notRunning'
                     schedule_logger.info(
                         'job {} component {} run on {} {} status is {}'.
                         format(job_id, component.get_name(), _role,
                                _party_id, task_status))
                     status_collect.add(task_status)
             if 'failed' in status_collect:
                 return False
             elif len(status_collect) == 1 and 'success' in status_collect:
                 return True
             else:
                 time.sleep(interval)
         except Exception as e:
             schedule_logger.exception(e)
             return False
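The check above aggregates per-party statuses into a set and only returns once every party agrees on a terminal state. A minimal standalone sketch of the same polling pattern, assuming an illustrative poll_statuses callable (not part of FATE) that returns the current status string for each party:

    import time

    def wait_for_consensus(poll_statuses, interval=1):
        while True:
            statuses = set(poll_statuses())
            if 'failed' in statuses:       # any single failure fails the component
                return False
            if statuses == {'success'}:    # every party must report success
                return True
            time.sleep(interval)           # still running somewhere; poll again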
Example #2
    def submit_job(job_data):
        job_id = generate_job_id()
        schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
        job_parameters = job_runtime_conf['job_parameters']
        job_initiator = job_runtime_conf['initiator']
        job_type = job_parameters.get('job_type', '')
        if job_type != 'predict':
            # generate job model info
            job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
            job_parameters['model_version'] = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                                   model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
            pipeline_model = job_tracker.get_output_model('pipeline')
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
        job_dsl_path, job_runtime_conf_path = save_job_conf(job_id=job_id,
                                                            job_dsl=job_dsl,
                                                            job_runtime_conf=job_runtime_conf)

        job = Job()
        job.f_job_id = job_id
        job.f_roles = json_dumps(job_runtime_conf['role'])
        job.f_work_mode = job_parameters['work_mode']
        job.f_initiator_party_id = job_initiator['party_id']
        job.f_dsl = json_dumps(job_dsl)
        job.f_runtime_conf = json_dumps(job_runtime_conf)
        job.f_train_runtime_conf = json_dumps(train_runtime_conf)
        job.f_run_ip = ''
        job.f_status = JobStatus.WAITING
        job.f_progress = 0
        job.f_create_time = current_timestamp()

        # save job info
        TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)

        # push into queue
        RuntimeConfig.JOB_QUEUE.put_event({
            'job_id': job_id,
            'initiator_role': job_initiator['role'],
            'initiator_party_id': job_initiator['party_id']
        })
        schedule_logger.info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
        board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
        return job_id, job_dsl_path, job_runtime_conf_path, {'model_id': job_parameters['model_id'],
                                                             'model_version': job_parameters[
                                                                 'model_version']}, board_url
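submit_job reads job_dsl and job_runtime_conf straight out of the request body, and the runtime conf must at least carry 'initiator', 'role' and 'job_parameters' (including 'work_mode'), since those keys are accessed directly. A hedged sketch of the minimal shape, with made-up party ids and an empty DSL:

    job_data = {
        'job_dsl': {'components': {}},  # pipeline definition goes here
        'job_runtime_conf': {
            'initiator': {'role': 'guest', 'party_id': 9999},
            'role': {'guest': [9999], 'host': [10000]},
            'job_parameters': {'work_mode': 1}  # non-predict jobs get model_id/model_version generated
        }
    }
    job_id, dsl_path, conf_path, model_info, board_url = submit_job(job_data)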
Example #3
 def run_do(self):
     try:
         running_tasks = job_utils.query_task(status='running',
                                              run_ip=get_lan_ip())
         stop_job_ids = set()
         detect_logger.info('start to detect running jobs')
         for task in running_tasks:
             try:
                 process_exist = job_utils.check_job_process(
                     int(task.f_run_pid))
                 if not process_exist:
                     detect_logger.info(
                         'job {} component {} on {} {} task {} {} process does not exist'
                         .format(task.f_job_id, task.f_component_name,
                                 task.f_role, task.f_party_id,
                                 task.f_task_id, task.f_run_pid))
                     stop_job_ids.add(task.f_job_id)
             except Exception as e:
                 detect_logger.exception(e)
         if stop_job_ids:
             schedule_logger.info(
                 'start to stop jobs: {}'.format(stop_job_ids))
         for job_id in stop_job_ids:
             jobs = job_utils.query_job(job_id=job_id)
             if jobs:
                 initiator_party_id = jobs[0].f_initiator_party_id
                 job_work_mode = jobs[0].f_work_mode
                 if len(jobs) > 1:
                     # this party is the initiator
                     my_party_id = initiator_party_id
                 else:
                     my_party_id = jobs[0].f_party_id
                 api_utils.federated_api(
                     job_id=job_id,
                     method='POST',
                     endpoint='/{}/job/stop'.format(API_VERSION),
                     src_party_id=my_party_id,
                     dest_party_id=initiator_party_id,
                     json_body={'job_id': job_id},
                     work_mode=job_work_mode)
                 schedule_logger.info(
                     'send stop job {} command'.format(job_id))
     except Exception as e:
         detect_logger.exception(e)
     finally:
         detect_logger.info('finished detecting running jobs')
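The detector relies on job_utils.check_job_process to decide whether a task's recorded pid is still alive. That helper is not shown here; a common POSIX way to implement such a check is signal 0, as in this sketch (an assumption, not necessarily FATE's implementation):

    import errno
    import os

    def process_exists(pid):
        try:
            os.kill(pid, 0)  # signal 0 performs error checking only; nothing is delivered
        except OSError as e:
            return e.errno == errno.EPERM  # alive, but owned by another user
        return True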
Example #4
 def run(self):
     if not self.queue.is_ready():
         schedule_logger.error('queue is not ready')
         return False
     all_jobs = []
     while True:
         try:
             if len(all_jobs) == self.concurrent_num:
                 for future in as_completed(all_jobs):
                     all_jobs.remove(future)
                     break
             job_event = self.queue.get_event()
             schedule_logger.info('schedule job {}'.format(job_event))
             future = self.job_executor_pool.submit(DAGScheduler.handle_event, job_event)
             future.add_done_callback(DAGScheduler.get_result)
             all_jobs.append(future)
         except Exception as e:
             schedule_logger.exception(e)
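The loop caps in-flight work at concurrent_num: once the list is full, as_completed blocks until any one future finishes before the next queue event is pulled. The same bounded-concurrency pattern in isolation (worker and the event range are placeholders):

    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def worker(event):
        time.sleep(0.1)  # stand-in for DAGScheduler.handle_event
        return event

    pool = ThreadPoolExecutor(max_workers=2)
    in_flight, limit = [], 2
    for event in range(10):
        if len(in_flight) == limit:
            for done in as_completed(in_flight):  # block until one slot frees up
                in_flight.remove(done)
                break
        in_flight.append(pool.submit(worker, event))
    pool.shutdown(wait=True)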
Example #5
 def kill_job(job_id, role, party_id, job_initiator):
     schedule_logger.info('{} {} get kill job {} command'.format(role, party_id, job_id))
     tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
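     # kill each party-local task process, then report the final status back to the initiator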
     for task in tasks:
         kill_status = False
         try:
             kill_status = job_utils.kill_process(int(task.f_run_pid))
         except Exception as e:
             schedule_logger.exception(e)
         finally:
             schedule_logger.info(
                 'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name, task.f_role,
                                                                          task.f_party_id, task.f_run_pid,
                                                                          'success' if kill_status else 'failed'))
         if task.f_status != TaskStatus.SUCCESS:
             task.f_status = TaskStatus.FAILED
         TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name, task_id=task.f_task_id,
                                       role=role,
                                       party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                       task_info=task.to_json())
Example #6
 def clean_job(job_id, role, party_id):
     schedule_logger.info('job {} on {} {} start to clean'.format(job_id, role, party_id))
     tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
     for task in tasks:
         try:
             Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task()
             schedule_logger.info(
                 'job {} component {} on {} {} clean done'.format(job_id, task.f_component_name, role, party_id))
         except Exception as e:
             schedule_logger.info(
                 'job {} component {} on {} {} clean failed'.format(job_id, task.f_component_name, role, party_id))
             schedule_logger.exception(e)
     schedule_logger.info('job {} on {} {} clean done'.format(job_id, role, party_id))
Example #7
 def start_task(job_id, component_name, task_id, role, party_id,
                task_config):
     schedule_logger.info('job {} {} {} {} task subprocess is ready'.format(
         job_id, component_name, role, party_id))
     task_process_start_status = False
     try:
         task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                 role, party_id, component_name)
         os.makedirs(task_dir, exist_ok=True)
         task_config_path = os.path.join(task_dir, 'task_config.json')
         with open(task_config_path, 'w') as fw:
             json.dump(task_config, fw)
         process_cmd = [
             'python3', sys.modules[TaskExecutor.__module__].__file__, '-j',
             job_id, '-n', component_name, '-t', task_id, '-r', role, '-p',
             party_id, '-c', task_config_path
         ]
         task_log_dir = os.path.join(
             job_utils.get_job_log_directory(job_id=job_id), role, party_id,
             component_name)
         schedule_logger.info(
             'job {} {} {} {} task subprocess start'.format(
                 job_id, component_name, role, party_id))
         p = job_utils.run_subprocess(config_dir=task_dir,
                                      process_cmd=process_cmd,
                                      log_dir=task_log_dir)
         if p:
             task_process_start_status = True
     except Exception as e:
         schedule_logger.exception(e)
     finally:
         schedule_logger.info(
             'job {} component {} on {} {} start task subprocess {}'.format(
                 job_id, component_name, role, party_id,
                 'success' if task_process_start_status else 'failed'))
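The process_cmd assembled above is the sending side of the argparse contract in TaskExecutor.run_task (Example #13). Rendered, the spawned command looks roughly like this (the script path is whichever file defines TaskExecutor; placeholders in angle brackets):

    python3 <task_executor.py> -j <job_id> -n <component_name> -t <task_id> -r <role> -p <party_id> -c <task_dir>/task_config.json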
Example #8
 def check_dependencies(job_id, dag, component):
     dependencies = dag.get_dependency().get('dependencies', {})
     if not dependencies:
         return False
     dependent_component_names = dependencies.get(component.get_name(), [])
     schedule_logger.info(
         'job {} component {} all dependent component: {}'.format(
             job_id, component.get_name(), dependent_component_names))
     for dependent_component_name in dependent_component_names:
         dependent_component = dag.get_component_info(
             dependent_component_name)
         dependent_component_task_status = TaskScheduler.check_task_status(
             job_id, dependent_component)
         schedule_logger.info(
             'job {} component {} dependency {} status is {}'.format(
                 job_id, component.get_name(), dependent_component_name,
                 dependent_component_task_status))
         if not dependent_component_task_status:
             # a dependent component failed, so this component cannot run
             return False
     return True
Example #9
    def stop_job(job_id):
        schedule_logger.info('get stop job {} command'.format(job_id))
        jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
        if jobs:
            initiator_job = jobs[0]
            job_info = {'f_job_id': job_id, 'f_status': JobStatus.FAILED}
            roles = json_loads(initiator_job.f_roles)
            job_work_mode = initiator_job.f_work_mode
            initiator_party_id = initiator_job.f_party_id

            # set status first
            TaskScheduler.sync_job_status(
                job_id=job_id,
                roles=roles,
                initiator_party_id=initiator_party_id,
                work_mode=job_work_mode,
                job_info=job_info)
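            # fan the kill command out to every party in every role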
            for role, partys in roles.items():
                for party_id in partys:
                    response = federated_api(
                        job_id=job_id,
                        method='POST',
                        endpoint='/{}/job/{}/{}/{}/kill'.format(
                            API_VERSION, job_id, role, party_id),
                        src_party_id=initiator_party_id,
                        dest_party_id=party_id,
                        json_body={
                            'job_initiator': {
                                'party_id': initiator_job.f_party_id,
                                'role': initiator_job.f_role
                            }
                        },
                        work_mode=job_work_mode)
                    if response['retcode'] == 0:
                        schedule_logger.info(
                            'send {} {} kill job {} command successfully'.
                            format(role, party_id, job_id))
                    else:
                        schedule_logger.info(
                            'send {} {} kill job {} command failed: {}'.format(
                                role, party_id, job_id, response['retmsg']))
        else:
            schedule_logger.info(
                'send stop job {} command failed'.format(job_id))
            raise Exception('cannot find job: {}'.format(job_id))
Example #10
    def run_job(job_id, initiator_role, initiator_party_id):
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
            job_id=job_id, role=initiator_role, party_id=initiator_party_id)
        job_parameters = job_runtime_conf.get('job_parameters', {})
        job_initiator = job_runtime_conf.get('initiator', {})
        dag = get_job_dsl_parser(dsl=job_dsl,
                                 runtime_conf=job_runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        if not job_initiator:
            return False
        storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE)
        job = Job()
        job.f_job_id = job_id
        job.f_start_time = current_timestamp()
        job.f_status = JobStatus.RUNNING
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job.to_json())

        top_level_task_status = set()
        components = dag.get_next_components(None)
        schedule_logger.info('job {} root components are {}'.format(
            job.f_job_id, [component.get_name() for component in components]))
        for component in components:
            try:
                # run a component as task
                run_status = TaskScheduler.run_component(
                    job_id, job_runtime_conf, job_parameters, job_initiator,
                    job_args, dag, component)
            except Exception as e:
                schedule_logger.exception(e)
                run_status = False
            top_level_task_status.add(run_status)
            if not run_status:
                break
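        # the set holds at most {True, False}; both present means partial success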
        if len(top_level_task_status) == 2:
            job.f_status = JobStatus.PARTIAL
        elif True in top_level_task_status:
            job.f_status = JobStatus.SUCCESS
        else:
            job.f_status = JobStatus.FAILED
        job.f_end_time = current_timestamp()
        job.f_elapsed = job.f_end_time - job.f_start_time
        if job.f_status == JobStatus.SUCCESS:
            job.f_progress = 100
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job.to_json())
        TaskScheduler.finish_job(job_id=job_id,
                                 job_runtime_conf=job_runtime_conf)
        schedule_logger.info('job {} finished, status is {}'.format(
            job.f_job_id, job.f_status))
Example #11
    def run_component(job_id, job_runtime_conf, job_parameters, job_initiator,
                      job_args, dag, component):
        parameters = component.get_role_parameters()
        component_name = component.get_name()
        module_name = component.get_module()
        task_id = job_utils.generate_task_id(job_id=job_id,
                                             component_name=component_name)
        schedule_logger.info('job {} run component {}'.format(
            job_id, component_name))
        for role, partys_parameters in parameters.items():
            for party_index, party_parameters in enumerate(partys_parameters):
                if role in job_args:
                    party_job_args = job_args[role][party_index]['args']
                else:
                    party_job_args = {}
                dest_party_id = party_parameters.get('local',
                                                     {}).get('party_id')

                federated_api(job_id=job_id,
                              method='POST',
                              endpoint='/{}/job/{}/{}/{}/{}/{}/run'.format(
                                  API_VERSION, job_id, component_name, task_id,
                                  role, dest_party_id),
                              src_party_id=job_initiator['party_id'],
                              dest_party_id=dest_party_id,
                              json_body={
                                  'job_parameters': job_parameters,
                                  'job_initiator': job_initiator,
                                  'job_args': party_job_args,
                                  'parameters': party_parameters,
                                  'module_name': module_name,
                                  'input': component.get_input(),
                                  'output': component.get_output()
                              },
                              work_mode=job_parameters['work_mode'])
        # the component succeeds only if every party reports success
        task_success = TaskScheduler.check_task_status(job_id=job_id,
                                                       component=component)
        schedule_logger.info('job {} component {} run {}'.format(
            job_id, component_name, 'success' if task_success else 'failed'))
        # update progress
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job_utils.update_job_progress(
                job_id=job_id, dag=dag, current_task_id=task_id).to_json())
        if task_success:
            next_components = dag.get_next_components(component_name)
            schedule_logger.info(
                'job {} component {} next components is {}'.format(
                    job_id, component_name, [
                        next_component.get_name()
                        for next_component in next_components
                    ]))
            for next_component in next_components:
                try:
                    schedule_logger.info(
                        'job {} check component {} dependencies status'.format(
                            job_id, next_component.get_name()))
                    dependencies_status = TaskScheduler.check_dependencies(
                        job_id=job_id, dag=dag, component=next_component)
                    schedule_logger.info(
                        'job {} component {} dependencies status is {}'.format(
                            job_id, next_component.get_name(),
                            dependencies_status))
                    if dependencies_status:
                        run_status = TaskScheduler.run_component(
                            job_id, job_runtime_conf, job_parameters,
                            job_initiator, job_args, dag, next_component)
                    else:
                        run_status = False
                except Exception as e:
                    schedule_logger.exception(e)
                    run_status = False
                if not run_status:
                    return False
            return True
        else:
            return False
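run_component walks the DAG depth-first: fan the component out to every party, wait for consensus, then recurse into each downstream component whose dependencies have all succeeded, aborting on the first failure. The traversal shape in isolation, with execute, get_next and deps_ok standing in for the federated_api fan-out, dag.get_next_components and check_dependencies:

    def run(node, execute, get_next, deps_ok):
        if not execute(node):
            return False
        for nxt in get_next(node):
            if not (deps_ok(nxt) and run(nxt, execute, get_next, deps_ok)):
                return False
        return True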
Example #12
 def update_task_status(job_id, component_name, task_id, role, party_id, task_info):
     tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name, task_id=task_id)
     tracker.save_task(role=role, party_id=party_id, task_info=task_info)
     schedule_logger.info(
         'job {} component {} {} {} status {}'.format(job_id, component_name, role, party_id,
                                                      task_info.get('f_status', '')))
Example #13
    def run_task():
        task = Task()
        task.f_create_time = current_timestamp()
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j',
                                '--job_id',
                                required=True,
                                type=str,
                                help="job id")
            parser.add_argument('-n',
                                '--component_name',
                                required=True,
                                type=str,
                                help="component name")
            parser.add_argument('-t',
                                '--task_id',
                                required=True,
                                type=str,
                                help="task id")
            parser.add_argument('-r',
                                '--role',
                                required=True,
                                type=str,
                                help="role")
            parser.add_argument('-p',
                                '--party_id',
                                required=True,
                                type=str,
                                help="party id")
            parser.add_argument('-c',
                                '--config',
                                required=True,
                                type=str,
                                help="task config")
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger.info('enter task process')
            schedule_logger.info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(
                    HTTP_PORT=args.job_server.split(':')[1])
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            role = args.role
            party_id = int(args.party_id)
            task_config = file_utils.load_json_conf(args.config)
            job_parameters = task_config['job_parameters']
            job_initiator = task_config['job_initiator']
            job_args = task_config['job_args']
            task_input_dsl = task_config['input']
            task_output_dsl = task_config['output']
            parameters = task_config['parameters']
            module_name = task_config['module_name']
        except Exception as e:
            schedule_logger.exception(e)
            task.f_status = TaskStatus.FAILED
            return
        try:
            # init environment, process is shared globally
            RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
            storage.init_storage(job_id=task_id,
                                 work_mode=RuntimeConfig.WORK_MODE)
            federation.init(job_id=task_id, runtime_conf=parameters)
            job_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role,
                str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                                  parent_log_dir=job_log_dir,
                                                  append_to_parent_log=True,
                                                  force=True)

            task.f_job_id = job_id
            task.f_component_name = component_name
            task.f_task_id = task_id
            task.f_role = role
            task.f_party_id = party_id
            task.f_operator = 'python_operator'
            tracker = Tracking(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               component_name=component_name,
                               task_id=task_id,
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'],
                               module_name=module_name)
            task.f_start_time = current_timestamp()
            task.f_run_ip = get_lan_ip()
            task.f_run_pid = os.getpid()
            run_class_paths = parameters.get('CodePath').split('/')
            run_class_package = '.'.join(
                run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                    '.py', '')
            run_class_name = run_class_paths[-1]
            task_run_args = TaskExecutor.get_task_run_args(
                job_id=job_id,
                role=role,
                party_id=party_id,
                job_parameters=job_parameters,
                job_args=job_args,
                input_dsl=task_input_dsl)
            run_object = getattr(importlib.import_module(run_class_package),
                                 run_class_name)()
            run_object.set_tracker(tracker=tracker)
            run_object.set_taskid(taskid=task_id)
            task.f_status = TaskStatus.RUNNING
            TaskExecutor.sync_task_status(job_id=job_id,
                                          component_name=component_name,
                                          task_id=task_id,
                                          role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get(
                                              'party_id', None),
                                          task_info=task.to_json())

            schedule_logger.info('run {} {} {} {} {} task'.format(
                job_id, component_name, task_id, role, party_id))
            schedule_logger.info(parameters)
            schedule_logger.info(task_input_dsl)
            run_object.run(parameters, task_run_args)
            if task_output_dsl:
                if task_output_dsl.get('data', []):
                    output_data = run_object.save_data()
                    tracker.save_output_data_table(
                        output_data,
                        task_output_dsl.get('data')[0])
                if task_output_dsl.get('model', []):
                    output_model = run_object.export_model()
                    # There is only one model output in the current DSL version.
                    tracker.save_output_model(output_model,
                                              task_output_dsl['model'][0])
            task.f_status = TaskStatus.SUCCESS
        except Exception as e:
            schedule_logger.exception(e)
            task.f_status = TaskStatus.FAILED
        finally:
            try:
                task.f_end_time = current_timestamp()
                task.f_elapsed = task.f_end_time - task.f_start_time
                task.f_update_time = current_timestamp()
                TaskExecutor.sync_task_status(
                    job_id=job_id,
                    component_name=component_name,
                    task_id=task_id,
                    role=role,
                    party_id=party_id,
                    initiator_party_id=job_initiator.get('party_id', None),
                    task_info=task.to_json())
            except Exception as e:
                schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name,
                                                     task_id, role, party_id,
                                                     task.f_status))
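The CodePath handling above turns a slash-separated path into an import target: everything before the file becomes the package prefix, the '.py' file becomes the final module segment, and the last element names the class. A worked example with an illustrative CodePath value:

    code_path = 'federatedml/model_base.py/ModelBase'  # illustrative value
    parts = code_path.split('/')
    package = '.'.join(parts[:-2]) + '.' + parts[-2].replace('.py', '')
    class_name = parts[-1]
    assert package == 'federatedml.model_base'
    assert class_name == 'ModelBase'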