Example no. 1
    def clean_queue():
        schedule_logger().info('get clean queue command')
        jobs = job_utils.query_job(is_initiator=1, status=JobStatus.WAITING)
        if jobs:
            for job in jobs:
                schedule_logger(job.f_job_id).info(
                    'start to send {} job {} command'.format(JobStatus.CANCELED, job.f_job_id))
                job_info = {'f_job_id': job.f_job_id, 'f_status': JobStatus.CANCELED}
                roles = json_loads(job.f_roles)
                job_work_mode = job.f_work_mode
                initiator_party_id = job.f_party_id

                TaskScheduler.sync_job_status(job_id=job.f_job_id, roles=roles, initiator_party_id=initiator_party_id,
                                              initiator_role=job.f_role,
                                              work_mode=job_work_mode,
                                              job_info=job_info)
                job_runtime_conf = json_loads(job.f_runtime_conf)
                event = job_utils.job_event(job.f_job_id,
                                            job_runtime_conf['initiator']['role'],
                                            job_runtime_conf['initiator']['party_id'])
                try:
                    RuntimeConfig.JOB_QUEUE.del_event(event)
                    schedule_logger(job.f_job_id).info(
                        'send {} job {} command success'.format(JobStatus.CANCELED, job.f_job_id))
                except Exception as e:
                    schedule_logger(job.f_job_id).error(e)
        else:
            raise Exception('There are no jobs in the queue')
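A minimal invocation sketch for the method above, assuming it lives on a scheduler/controller class (the enclosing class is not shown in the excerpt; JobController is a placeholder):

    # Hypothetical wiring: clean_queue raises when no WAITING jobs exist, so guard the call.
    try:
        JobController.clean_queue()
    except Exception as e:
        schedule_logger().warning('clean queue skipped: {}'.format(e))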
Example no. 2
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info,
                                  required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = job_utils.query_job(job_id=job_info["job_id"],
                                       party_id=job_info["party_id"],
                                       role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(
                    job_info.get('job_id', '')))
            job = jobs[0]
            job_dsl_parser = job_utils.get_job_dsl_parser(
                dsl=json_loads(job.f_dsl),
                runtime_conf=json_loads(job.f_runtime_conf),
                train_runtime_conf=json_loads(job.f_train_runtime_conf))
        else:
            job_dsl_parser = job_utils.get_job_dsl_parser(
                dsl=job_info.get('job_dsl', {}),
                runtime_conf=job_info.get('job_runtime_conf', {}),
                train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return job_dsl_parser.get_dependency(role=job_info["role"],
                                             party_id=int(
                                                 job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
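A hedged usage sketch; the job id is a placeholder, and the dict mirrors the arguments the function checks:

    # Hypothetical call: resolve component dependencies for a stored job.
    dependency = pipeline_dag_dependency({
        'job_id': '20200101000000000000',  # placeholder
        'role': 'guest',
        'party_id': 9999,
    })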
Example no. 3
    def update_job_status(job_id, role, party_id, job_info, create=False):
        job_info['f_run_ip'] = RuntimeConfig.JOB_SERVER_HOST
        if create:
            dsl = json_loads(job_info['f_dsl'])
            runtime_conf = json_loads(job_info['f_runtime_conf'])
            train_runtime_conf = json_loads(job_info['f_train_runtime_conf'])
            if USE_AUTHENTICATION:
                authentication_check(src_role=job_info.get('src_role', None), src_party_id=job_info.get('src_party_id', None),
                                     dsl=dsl, runtime_conf=runtime_conf, role=role, party_id=party_id)
            save_job_conf(job_id=job_id,
                          job_dsl=dsl,
                          job_runtime_conf=runtime_conf,
                          train_runtime_conf=train_runtime_conf,
                          pipeline_dsl=None)

            job_parameters = runtime_conf['job_parameters']
            job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                                   model_id=job_parameters["model_id"],
                                   model_version=job_parameters["model_version"])
            if job_parameters.get("job_type", "") != "predict":
                job_tracker.init_pipelined_model()
            roles = json_loads(job_info['f_roles'])
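            # build two views of the participants: show_role holds the parties this
            # party may display, partner holds every other participating party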
            partner = {}
            show_role = {}
            is_initiator = job_info.get('f_is_initiator', 0)
            for _role, _role_party in roles.items():
                if is_initiator or _role == role:
                    show_role[_role] = show_role.get(_role, [])
                    for _party_id in _role_party:
                        if is_initiator or _party_id == party_id:
                            show_role[_role].append(_party_id)

                if _role != role:
                    partner[_role] = partner.get(_role, [])
                    partner[_role].extend(_role_party)
                else:
                    for _party_id in _role_party:
                        if _party_id != party_id:
                            partner[_role] = partner.get(_role, [])
                            partner[_role].append(_party_id)

            dag = get_job_dsl_parser(dsl=dsl,
                                     runtime_conf=runtime_conf,
                                     train_runtime_conf=train_runtime_conf)
            job_args = dag.get_args_input()
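            # map each visible party's input tables to "namespace.name" strings for the job view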
            dataset = {}
            for _role, _role_party_args in job_args.items():
                if is_initiator or _role == role:
                    for _party_index in range(len(_role_party_args)):
                        _party_id = roles[_role][_party_index]
                        if is_initiator or _party_id == party_id:
                            dataset[_role] = dataset.get(_role, {})
                            dataset[_role][_party_id] = dataset[_role].get(_party_id, {})
                            for _data_type, _data_location in _role_party_args[_party_index]['args']['data'].items():
                                dataset[_role][_party_id][_data_type] = '{}.{}'.format(_data_location['namespace'],
                                                                                       _data_location['name'])
            job_tracker.log_job_view({'partner': partner, 'dataset': dataset, 'roles': show_role})
        else:
            job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id)
        job_tracker.save_job_info(role=role, party_id=party_id, job_info=job_info, create=create)
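A sketch of a first-time call (create=True); the enclosing class name and the payload are assumptions reconstructed from the fields the method reads above:

    # Hypothetical: job_info mirrors the Job table's f_* columns consumed by the method.
    job_info = {
        'f_dsl': json_dumps(dsl),
        'f_runtime_conf': json_dumps(runtime_conf),
        'f_train_runtime_conf': json_dumps({}),
        'f_roles': json_dumps(runtime_conf['role']),
        'f_is_initiator': 1,
    }
    JobController.update_job_status(job_id=job_id, role='guest', party_id=9999,
                                    job_info=job_info, create=True)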
Example no. 4
def get_job_dsl_parser_by_job_id(job_id):
    with DB.connection_context():
        jobs = Job.select(
            Job.f_dsl, Job.f_runtime_conf,
            Job.f_train_runtime_conf).where(Job.f_job_id == job_id)
        if jobs:
            job = jobs[0]
            job_dsl_parser = get_job_dsl_parser(
                dsl=json_loads(job.f_dsl),
                runtime_conf=json_loads(job.f_runtime_conf),
                train_runtime_conf=json_loads(job.f_train_runtime_conf))
            return job_dsl_parser
        else:
            return None
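Usage sketch (placeholder job id); this pairs naturally with get_dependency as used in Example no. 2:

    # Hypothetical: fetch the parsed DAG for a stored job, then query its dependencies.
    dsl_parser = get_job_dsl_parser_by_job_id('20200101000000000000')
    if dsl_parser:
        dependency = dsl_parser.get_dependency(role='guest', party_id=9999)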
Example no. 5
def job_config():
    jobs = job_utils.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    else:
        job = jobs[0]
        response_data = dict()
        response_data['job_id'] = job.f_job_id
        response_data['dsl'] = json_loads(job.f_dsl)
        response_data['runtime_conf'] = json_loads(job.f_runtime_conf)
        response_data['train_runtime_conf'] = json_loads(job.f_train_runtime_conf)
        response_data['model_info'] = {'model_id': response_data['runtime_conf']['job_parameters']['model_id'],
                                       'model_version': response_data['runtime_conf']['job_parameters'][
                                           'model_version']}
        return get_json_result(retcode=0, retmsg='success', data=response_data)
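job_config reads its filters from request.json, so it is an HTTP handler; a hedged client-side sketch (the route and port are assumptions, since the decorator is not shown in the excerpt):

    # Hypothetical client call against a FATE-Flow style server.
    import requests
    response = requests.post('http://127.0.0.1:9380/v1/job/config',
                             json={'job_id': '20200101000000000000'})
    print(response.json().get('data', {}).get('model_info'))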
Example no. 6
 def get_data_table_meta(key, data_table_name, data_table_namespace):
     """
     get data table meta information
     :param key:
     :param data_table_name: table name of this data table
     :param data_table_namespace: table name of this data table
     :return:
     """
     from arch.api.utils.core_utils import json_loads
     data_meta_table = FateSession.get_instance().table(
         name="%s.meta" % data_table_name,
         namespace=data_table_namespace,
         create_if_missing=True,
         error_if_exist=False,
         in_place_computing=False,
         persistent=True,
         partition=1)
     if data_meta_table:
         value_bytes = data_meta_table.get(key, use_serialize=False)
         if value_bytes:
             return json_loads(value_bytes)
         else:
             return None
     else:
         return None
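A hedged usage sketch; the 'schema' key and table names are placeholders:

    # Hypothetical: read one meta entry stored alongside a data table.
    schema = get_data_table_meta('schema',
                                 data_table_name='breast_guest',
                                 data_table_namespace='experiment')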
Example no. 7
    def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
        schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(role, party_id, job_id, component_name))
        task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
        tasks = job_utils.query_task(**task_info)
        jobs = job_utils.query_job(job_id=job_id)
        for task in tasks:
            kill_status = False
            try:
                # task clean up
                runtime_conf = json_loads(jobs[0].f_runtime_conf)
                roles = ','.join(runtime_conf['role'].keys())
                party_ids = ','.join([','.join([str(j) for j in i]) for i in runtime_conf['role'].values()])
                # Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
                # stop task
                kill_status = job_utils.kill_task_executor_process(task)
                # session stop
                job_utils.start_session_stop(task)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            finally:
                schedule_logger(job_id).info(
                    'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name, task.f_role,
                                                                             task.f_party_id, task.f_run_pid,
                                                                             'success' if kill_status else 'failed'))
            status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT

            if task.f_status != TaskStatus.COMPLETE:
                task.f_status = status
            try:
                TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name, task_id=task.f_task_id,
                                              role=role,
                                              party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                              task_info=task.to_json(), initiator_role=job_initiator.get('role', None))
            except Exception as e:
                schedule_logger(job_id).exception(e)
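A hedged invocation sketch from the party handling a kill request; the enclosing class and ids are placeholders (the job_initiator shape matches the json_body sent in Example no. 8):

    # Hypothetical: kill every task of a job on this party.
    TaskScheduler.kill_job(job_id='20200101000000000000', role='host', party_id=10000,
                           job_initiator={'role': 'guest', 'party_id': 9999},
                           timeout=False, component_name='')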
Example no. 8
    def stop(job_id, end_status=JobStatus.FAILED, component_name=''):
        schedule_logger(job_id).info('get {} job {} {} command'.format("cancel" if end_status == JobStatus.CANCELED else "stop", job_id, component_name))
        jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
        cancel_success = False
        is_cancel = (end_status == JobStatus.CANCELED)
        if jobs:
            initiator_job = jobs[0]
            job_info = {'f_job_id': job_id, 'f_status': end_status}
            roles = json_loads(initiator_job.f_roles)
            job_work_mode = initiator_job.f_work_mode
            initiator_party_id = initiator_job.f_party_id

            # set status first
            if not component_name:
                TaskScheduler.sync_job_status(job_id=job_id, roles=roles, initiator_party_id=initiator_party_id,
                                              initiator_role=initiator_job.f_role,
                                              work_mode=job_work_mode,
                                              job_info=job_info)
            for role, party_ids in roles.items():
                for party_id in party_ids:
                    response = federated_api(job_id=job_id,
                                             method='POST',
                                             endpoint='/{}/schedule/{}/{}/{}/{}'.format(
                                                 API_VERSION,
                                                 job_id,
                                                 role,
                                                 party_id,
                                                 "cancel" if is_cancel else "kill"
                                             ),
                                             src_party_id=initiator_party_id,
                                             dest_party_id=party_id,
                                             src_role=initiator_job.f_role,
                                             json_body={'job_initiator': {'party_id': initiator_job.f_party_id,
                                                                          'role': initiator_job.f_role},
                                                        'timeout': end_status == JobStatus.TIMEOUT,
                                                        'component_name': component_name
                                                        },
                                             work_mode=job_work_mode)
                    if response['retcode'] == 0:
                        cancel_success = True
                        schedule_logger(job_id).info(
                            'send {} {} {} job {} {} command successfully'.format(role, party_id, "cancel" if is_cancel else "kill",
                                                                                  job_id, component_name))
                        if is_cancel:
                            break
                    else:
                        schedule_logger(job_id).info(
                            'send {} {} {} job {} {} command failed: {}'.format(role, party_id, "cancel" if is_cancel else "kill",
                                                                                job_id, component_name, response['retmsg']))
            if is_cancel:
                return cancel_success
        else:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                raise Exception('Current role is not this job initiator')
            schedule_logger(job_id).info('send {} job {} {} command failed'.format("cancel" if is_cancel else "kill", job_id, component_name))
            raise Exception('cannot find job: {}'.format(job_id))
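A hedged usage sketch; the enclosing class is a placeholder:

    # Hypothetical: cancel a waiting job from the initiator; for CANCELED the
    # method returns whether at least one party accepted the cancel command.
    cancelled = TaskScheduler.stop(job_id='20200101000000000000',
                                   end_status=JobStatus.CANCELED)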
Example no. 9
def get_commit_tmp(commit_id, data_table_namespace):
    version_tmp_table = get_commit_tmp_table(
        data_table_namespace=data_table_namespace)
    commit_tmp_info = version_tmp_table.get(commit_id, use_serialize=False)
    if commit_tmp_info:
        commit_tmp = json_loads(commit_tmp_info)
        return commit_tmp["tag"], commit_tmp["branch"]
    else:
        return None, "master"
Example no. 10
 def run_do(self):
     try:
         running_tasks = job_utils.query_task(status='running',
                                              run_ip=get_lan_ip())
         stop_job_ids = set()
         # detect_logger.info('start to detect running job..')
         for task in running_tasks:
             try:
                 process_exist = job_utils.check_job_process(
                     int(task.f_run_pid))
                 if not process_exist:
                     detect_logger.info(
                         'job {} component {} on {} {} task {} {} process does not exist'
                         .format(task.f_job_id, task.f_component_name,
                                 task.f_role, task.f_party_id,
                                 task.f_task_id, task.f_run_pid))
                     stop_job_ids.add(task.f_job_id)
             except Exception as e:
                 detect_logger.exception(e)
         if stop_job_ids:
             schedule_logger().info(
                 'start to stop jobs: {}'.format(stop_job_ids))
         for job_id in stop_job_ids:
             jobs = job_utils.query_job(job_id=job_id)
             if jobs:
                 initiator_party_id = jobs[0].f_initiator_party_id
                 job_work_mode = jobs[0].f_work_mode
                 if len(jobs) > 1:
                     # this party is the initiator
                     my_party_id = initiator_party_id
                 else:
                     my_party_id = jobs[0].f_party_id
                     initiator_party_id = jobs[0].f_initiator_party_id
                 api_utils.federated_api(
                     job_id=job_id,
                     method='POST',
                     endpoint='/{}/job/stop'.format(API_VERSION),
                     src_party_id=my_party_id,
                     dest_party_id=initiator_party_id,
                     src_role=None,
                     json_body={
                         'job_id': job_id,
                         'operate': 'kill'
                     },
                     work_mode=job_work_mode)
                 TaskScheduler.finish_job(job_id=job_id,
                                          job_runtime_conf=json_loads(
                                              jobs[0].f_runtime_conf),
                                          stop=True)
     except Exception as e:
         detect_logger.exception(e)
     finally:
         detect_logger.info('finish detect running job')
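run_do is a single detection pass; a hedged sketch of periodic wiring (the real daemon scheduling is not shown in the excerpt):

    # Hypothetical: run the detector on a fixed interval in a background thread.
    import threading
    def schedule_detector(detector, interval=5):
        detector.run_do()
        timer = threading.Timer(interval, schedule_detector, args=(detector, interval))
        timer.daemon = True
        timer.start()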
Example no. 11
def check_request_parameters(request_data):
    with DB.connection_context():
        if 'role' not in request_data and 'party_id' not in request_data:
            jobs = Job.select(Job.f_runtime_conf).where(Job.f_job_id == request_data.get('job_id', ''),
                                                        Job.f_is_initiator == 1)
            if jobs:
                job = jobs[0]
                job_runtime_conf = json_loads(job.f_runtime_conf)
                job_initiator = job_runtime_conf.get('initiator', {})
                role = job_initiator.get('role', '')
                party_id = job_initiator.get('party_id', 0)
                request_data['role'] = role
                request_data['party_id'] = party_id
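Usage sketch; note the function mutates request_data in place:

    # Hypothetical: backfill role/party_id from the initiator's stored runtime conf.
    request_data = {'job_id': '20200101000000000000'}
    check_request_parameters(request_data)
    # request_data now carries 'role' and 'party_id' if the initiator job was found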
Example no. 12
def get_job_configuration(job_id, role, party_id, tasks=None):
    with DB.connection_context():
        if tasks:
            jobs_run_conf = {}
            for task in tasks:
                jobs = Job.select(
                    Job.f_job_id, Job.f_runtime_conf,
                    Job.f_description).where(Job.f_job_id == task.f_job_id)
                job = jobs[0]
                jobs_run_conf[job.f_job_id] = json_loads(
                    job.f_runtime_conf)["role_parameters"]["local"]["upload_0"]
                jobs_run_conf[job.f_job_id]["notes"] = job.f_description
            return jobs_run_conf
        else:
            jobs = Job.select(Job.f_dsl, Job.f_runtime_conf,
                              Job.f_train_runtime_conf).where(
                                  Job.f_job_id == job_id, Job.f_role == role,
                                  Job.f_party_id == party_id)
        if jobs:
            job = jobs[0]
            return json_loads(job.f_dsl), json_loads(
                job.f_runtime_conf), json_loads(job.f_train_runtime_conf)
        else:
            return {}, {}, {}
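Usage sketch with placeholder identifiers:

    # Hypothetical: load the three configuration documents of one job.
    dsl, runtime_conf, train_runtime_conf = get_job_configuration(
        job_id='20200101000000000000', role='guest', party_id=9999)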
Example no. 13
 def cancel_job(job_id, role, party_id, job_initiator):
     schedule_logger(job_id).info('{} {} get cancel waiting job {} command'.format(role, party_id, job_id))
     jobs = job_utils.query_job(job_id=job_id)
     if jobs:
         job = jobs[0]
         job_runtime_conf = json_loads(job.f_runtime_conf)
         event = job_utils.job_event(job.f_job_id,
                                     job_runtime_conf['initiator']['role'],
                                     job_runtime_conf['initiator']['party_id'])
         try:
             RuntimeConfig.JOB_QUEUE.del_event(event)
         except Exception:
             return False
         schedule_logger(job_id).info('cancel waiting job successfully, job id is {}'.format(job.f_job_id))
         return True
     else:
         raise Exception('role {} party id {} cancel waiting job failed, job {} not found'.format(role, party_id, job_id))
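A hedged usage sketch; ids are placeholders and job_initiator mirrors the shape used elsewhere in these examples:

    # Hypothetical: True means the waiting event was removed from the queue.
    ok = cancel_job(job_id='20200101000000000000', role='guest', party_id=9999,
                    job_initiator={'role': 'guest', 'party_id': 9999})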
Example no. 14
 def get_data_table_metas(data_table_name, data_table_namespace):
     """
     get data table meta information
     :param data_table_name: table name of this data table
     :param data_table_namespace: table name of this data table
     :return:
     """
     from arch.api.utils.core_utils import json_loads
     data_meta_table = FateSession.get_instance().table(
         name="%s.meta" % data_table_name,
         namespace=data_table_namespace,
         partition=1,
         persistent=True,
         in_place_computing=False,
         create_if_missing=True,
         error_if_exist=False)
     if data_meta_table:
         metas = dict()
         for k, v in data_meta_table.collect(use_serialize=False):
             metas[k] = json_loads(v)
         return metas
     else:
         return None
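Usage sketch with placeholder table names:

    # Hypothetical: dump every meta entry for a data table.
    metas = get_data_table_metas('breast_guest', 'experiment')
    if metas:
        for key, value in metas.items():
            print(key, value)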
Example no. 15
    def submit_job(job_data, job_id=None):
        if not job_id:
            job_id = generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
        job_parameters = job_runtime_conf['job_parameters']
        job_initiator = job_runtime_conf['initiator']
        job_type = job_parameters.get('job_type', '')
        if job_type != 'predict':
            # generate job model info
            job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
            job_parameters['model_version'] = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                                   model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
            pipeline_model = job_tracker.get_output_model('pipeline')
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
        path_dict = save_job_conf(job_id=job_id,
                                  job_dsl=job_dsl,
                                  job_runtime_conf=job_runtime_conf,
                                  train_runtime_conf=train_runtime_conf,
                                  pipeline_dsl=None)

        job = Job()
        job.f_job_id = job_id
        job.f_roles = json_dumps(job_runtime_conf['role'])
        job.f_work_mode = job_parameters['work_mode']
        job.f_initiator_party_id = job_initiator['party_id']
        job.f_dsl = json_dumps(job_dsl)
        job.f_runtime_conf = json_dumps(job_runtime_conf)
        job.f_train_runtime_conf = json_dumps(train_runtime_conf)
        job.f_run_ip = ''
        job.f_status = JobStatus.WAITING
        job.f_progress = 0
        job.f_create_time = current_timestamp()

        initiator_role = job_initiator['role']
        initiator_party_id = job_initiator['party_id']
        if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
            raise Exception("initiator party id error {}".format(initiator_party_id))

        get_job_dsl_parser(dsl=job_dsl,
                           runtime_conf=job_runtime_conf,
                           train_runtime_conf=train_runtime_conf)

        TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)

        # push into queue
        job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id)
        try:
            RuntimeConfig.JOB_QUEUE.put_event(job_event)
        except Exception as e:
            raise Exception('push job into queue failed: {}'.format(e))

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
        board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
        logs_directory = get_job_log_directory(job_id)
        return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
               {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']}, board_url
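A hedged submission sketch; the enclosing class is a placeholder and the two conf dicts must satisfy check_pipeline_job_runtime_conf:

    # Hypothetical: submit a training job; returns conf paths, model info and the board URL.
    job_id, dsl_path, conf_path, log_dir, model_info, board_url = JobController.submit_job(
        {'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf})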
Example no. 16
def get_version_info(version_table, commit_id):
    info = version_table.get(commit_id, use_serialize=False)
    if info:
        return json_loads(info)
    else:
        return dict()
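Usage sketch with placeholder arguments:

    # Hypothetical: read commit metadata; returns {} when the commit id is absent.
    info = get_version_info(version_table, commit_id='commit_0')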