def clean_queue():
    schedule_logger().info('get clean queue command')
    jobs = job_utils.query_job(is_initiator=1, status=JobStatus.WAITING)
    if jobs:
        for job in jobs:
            schedule_logger(job.f_job_id).info(
                'start to send {} job {} command'.format(JobStatus.CANCELED, job.f_job_id))
            job_info = {'f_job_id': job.f_job_id, 'f_status': JobStatus.CANCELED}
            roles = json_loads(job.f_roles)
            job_work_mode = job.f_work_mode
            initiator_party_id = job.f_party_id
            TaskScheduler.sync_job_status(job_id=job.f_job_id, roles=roles,
                                          initiator_party_id=initiator_party_id,
                                          initiator_role=job.f_role,
                                          work_mode=job_work_mode,
                                          job_info=job_info)
            job_runtime_conf = json_loads(job.f_runtime_conf)
            event = job_utils.job_event(job.f_job_id,
                                        job_runtime_conf['initiator']['role'],
                                        job_runtime_conf['initiator']['party_id'])
            try:
                RuntimeConfig.JOB_QUEUE.del_event(event)
                schedule_logger(job.f_job_id).info(
                    'send {} job {} command success'.format(JobStatus.CANCELED, job.f_job_id))
            except Exception as e:
                schedule_logger(job.f_job_id).error(e)
    else:
        raise Exception('There are no jobs in the queue')
def pipeline_dag_dependency(job_info):
    try:
        detect_utils.check_config(job_info, required_arguments=["party_id", "role"])
        if job_info.get('job_id'):
            jobs = job_utils.query_job(job_id=job_info["job_id"],
                                       party_id=job_info["party_id"],
                                       role=job_info["role"])
            if not jobs:
                raise Exception('query job {} failed'.format(job_info.get('job_id', '')))
            job = jobs[0]
            job_dsl_parser = job_utils.get_job_dsl_parser(dsl=json_loads(job.f_dsl),
                                                          runtime_conf=json_loads(job.f_runtime_conf),
                                                          train_runtime_conf=json_loads(job.f_train_runtime_conf))
        else:
            job_dsl_parser = job_utils.get_job_dsl_parser(dsl=job_info.get('job_dsl', {}),
                                                          runtime_conf=job_info.get('job_runtime_conf', {}),
                                                          train_runtime_conf=job_info.get('job_train_runtime_conf', {}))
        return job_dsl_parser.get_dependency(role=job_info["role"], party_id=int(job_info["party_id"]))
    except Exception as e:
        stat_logger.exception(e)
        raise e
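# A minimal usage sketch for pipeline_dag_dependency. The required
# "role"/"party_id" keys mirror the check_config call above; the job id and
# party values are hypothetical, purely for illustration.
def _example_pipeline_dag_dependency():
    dependency = pipeline_dag_dependency({
        'job_id': '20201030151212421430',  # hypothetical stored job id
        'role': 'guest',
        'party_id': 9999
    })
    return dependency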
def update_job_status(job_id, role, party_id, job_info, create=False):
    job_info['f_run_ip'] = RuntimeConfig.JOB_SERVER_HOST
    if create:
        dsl = json_loads(job_info['f_dsl'])
        runtime_conf = json_loads(job_info['f_runtime_conf'])
        train_runtime_conf = json_loads(job_info['f_train_runtime_conf'])
        if USE_AUTHENTICATION:
            authentication_check(src_role=job_info.get('src_role', None),
                                 src_party_id=job_info.get('src_party_id', None),
                                 dsl=dsl, runtime_conf=runtime_conf,
                                 role=role, party_id=party_id)
        save_job_conf(job_id=job_id,
                      job_dsl=dsl,
                      job_runtime_conf=runtime_conf,
                      train_runtime_conf=train_runtime_conf,
                      pipeline_dsl=None)
        job_parameters = runtime_conf['job_parameters']
        job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                               model_id=job_parameters["model_id"],
                               model_version=job_parameters["model_version"])
        if job_parameters.get("job_type", "") != "predict":
            job_tracker.init_pipelined_model()
        roles = json_loads(job_info['f_roles'])
        partner = {}
        show_role = {}
        is_initiator = job_info.get('f_is_initiator', 0)
        for _role, _role_party in roles.items():
            if is_initiator or _role == role:
                show_role[_role] = show_role.get(_role, [])
                for _party_id in _role_party:
                    if is_initiator or _party_id == party_id:
                        show_role[_role].append(_party_id)
            if _role != role:
                partner[_role] = partner.get(_role, [])
                partner[_role].extend(_role_party)
            else:
                for _party_id in _role_party:
                    if _party_id != party_id:
                        partner[_role] = partner.get(_role, [])
                        partner[_role].append(_party_id)
        dag = get_job_dsl_parser(dsl=dsl,
                                 runtime_conf=runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        dataset = {}
        for _role, _role_party_args in job_args.items():
            if is_initiator or _role == role:
                for _party_index in range(len(_role_party_args)):
                    _party_id = roles[_role][_party_index]
                    if is_initiator or _party_id == party_id:
                        dataset[_role] = dataset.get(_role, {})
                        dataset[_role][_party_id] = dataset[_role].get(_party_id, {})
                        for _data_type, _data_location in _role_party_args[_party_index]['args']['data'].items():
                            dataset[_role][_party_id][_data_type] = '{}.{}'.format(_data_location['namespace'],
                                                                                   _data_location['name'])
        job_tracker.log_job_view({'partner': partner, 'dataset': dataset, 'roles': show_role})
    else:
        job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id)
    job_tracker.save_job_info(role=role, party_id=party_id, job_info=job_info, create=create)
def get_job_dsl_parser_by_job_id(job_id):
    with DB.connection_context():
        jobs = Job.select(Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf).where(Job.f_job_id == job_id)
        if jobs:
            job = jobs[0]
            job_dsl_parser = get_job_dsl_parser(dsl=json_loads(job.f_dsl),
                                                runtime_conf=json_loads(job.f_runtime_conf),
                                                train_runtime_conf=json_loads(job.f_train_runtime_conf))
            return job_dsl_parser
        else:
            return None
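# Sketch: combine get_job_dsl_parser_by_job_id with the parser's
# get_dependency API (the same call used in pipeline_dag_dependency above)
# to inspect a stored job's DAG. The job id and party values are hypothetical.
def _example_dag_from_job_id():
    parser = get_job_dsl_parser_by_job_id('20201030151212421430')
    if parser:
        return parser.get_dependency(role='guest', party_id=9999)
    return None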
def job_config():
    jobs = job_utils.query_job(**request.json)
    if not jobs:
        return get_json_result(retcode=101, retmsg='find job failed')
    else:
        job = jobs[0]
        response_data = dict()
        response_data['job_id'] = job.f_job_id
        response_data['dsl'] = json_loads(job.f_dsl)
        response_data['runtime_conf'] = json_loads(job.f_runtime_conf)
        response_data['train_runtime_conf'] = json_loads(job.f_train_runtime_conf)
        response_data['model_info'] = {'model_id': response_data['runtime_conf']['job_parameters']['model_id'],
                                       'model_version': response_data['runtime_conf']['job_parameters']['model_version']}
        return get_json_result(retcode=0, retmsg='success', data=response_data)
def get_data_table_meta(key, data_table_name, data_table_namespace):
    """
    get data table meta information
    :param key: key of the meta entry to fetch
    :param data_table_name: table name of this data table
    :param data_table_namespace: namespace of this data table
    :return:
    """
    from arch.api.utils.core_utils import json_loads
    data_meta_table = FateSession.get_instance().table(name="%s.meta" % data_table_name,
                                                       namespace=data_table_namespace,
                                                       create_if_missing=True,
                                                       error_if_exist=False,
                                                       in_place_computing=False,
                                                       persistent=True,
                                                       partition=1)
    if data_meta_table:
        value_bytes = data_meta_table.get(key, use_serialize=False)
        if value_bytes:
            return json_loads(value_bytes)
        else:
            return None
    else:
        return None
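# Sketch: fetch a single meta entry back from the ".meta" side table created
# above. The key name "schema" and the table name/namespace are assumptions
# for illustration, not fixed by this module.
def _example_read_table_meta():
    return get_data_table_meta(key='schema',
                               data_table_name='breast_hetero_guest',
                               data_table_namespace='experiment')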
def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
    schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(role, party_id, job_id, component_name))
    task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
    tasks = job_utils.query_task(**task_info)
    job = job_utils.query_job(job_id=job_id)
    for task in tasks:
        kill_status = False
        try:
            # task clean up
            runtime_conf = json_loads(job[0].f_runtime_conf)
            roles = ','.join(runtime_conf['role'].keys())
            party_ids = ','.join([','.join([str(j) for j in i]) for i in runtime_conf['role'].values()])
            # Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
            # stop task
            kill_status = job_utils.kill_task_executor_process(task)
            # session stop
            job_utils.start_session_stop(task)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name,
                                                                         task.f_role, task.f_party_id,
                                                                         task.f_run_pid,
                                                                         'success' if kill_status else 'failed'))
        status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT
        if task.f_status != TaskStatus.COMPLETE:
            task.f_status = status
        try:
            TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                          task_id=task.f_task_id, role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          task_info=task.to_json(),
                                          initiator_role=job_initiator.get('role', None))
        except Exception as e:
            schedule_logger(job_id).exception(e)
def stop(job_id, end_status=JobStatus.FAILED, component_name=''):
    schedule_logger(job_id).info('get {} job {} {} command'.format(
        "cancel" if end_status == JobStatus.CANCELED else "stop", job_id, component_name))
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    cancel_success = False
    is_cancel = (end_status == JobStatus.CANCELED)
    if jobs:
        initiator_job = jobs[0]
        job_info = {'f_job_id': job_id, 'f_status': end_status}
        roles = json_loads(initiator_job.f_roles)
        job_work_mode = initiator_job.f_work_mode
        initiator_party_id = initiator_job.f_party_id
        # set status first
        if not component_name:
            TaskScheduler.sync_job_status(job_id=job_id, roles=roles,
                                          initiator_party_id=initiator_party_id,
                                          initiator_role=initiator_job.f_role,
                                          work_mode=job_work_mode,
                                          job_info=job_info)
        for role, partys in roles.items():
            for party_id in partys:
                response = federated_api(job_id=job_id,
                                         method='POST',
                                         endpoint='/{}/schedule/{}/{}/{}/{}'.format(
                                             API_VERSION,
                                             job_id,
                                             role,
                                             party_id,
                                             "cancel" if is_cancel else "kill"
                                         ),
                                         src_party_id=initiator_party_id,
                                         dest_party_id=party_id,
                                         src_role=initiator_job.f_role,
                                         json_body={'job_initiator': {'party_id': initiator_job.f_party_id,
                                                                      'role': initiator_job.f_role},
                                                    'timeout': end_status == JobStatus.TIMEOUT,
                                                    'component_name': component_name
                                                    },
                                         work_mode=job_work_mode)
                if response['retcode'] == 0:
                    cancel_success = True
                    schedule_logger(job_id).info(
                        'send {} {} {} job {} {} command successfully'.format(
                            role, party_id, "cancel" if is_cancel else "kill", job_id, component_name))
                    if is_cancel:
                        break
                else:
                    schedule_logger(job_id).info(
                        'send {} {} {} job {} {} command failed: {}'.format(
                            role, party_id, "cancel" if is_cancel else "kill", job_id, component_name,
                            response['retmsg']))
        if is_cancel:
            return cancel_success
    else:
        jobs = job_utils.query_job(job_id=job_id)
        if jobs:
            raise Exception('Current role is not this job initiator')
        schedule_logger(job_id).info('send {} job {} {} command failed'.format(
            "cancel" if is_cancel else "kill", job_id, component_name))
        raise Exception('cannot find job: {}'.format(job_id))
def get_commit_tmp(commit_id, data_table_namespace):
    version_tmp_table = get_commit_tmp_table(data_table_namespace=data_table_namespace)
    commit_tmp_info = version_tmp_table.get(commit_id, use_serialize=False)
    if commit_tmp_info:
        commit_tmp = json_loads(commit_tmp_info)
        return commit_tmp["tag"], commit_tmp["branch"]
    else:
        return None, "master"
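# Sketch: resolve the (tag, branch) recorded for a commit; per the fallback
# above, an unknown commit id yields (None, "master"). The commit id and
# namespace here are hypothetical.
def _example_commit_tmp_lookup():
    tag, branch = get_commit_tmp(commit_id='commit_0001',
                                 data_table_namespace='experiment')
    return tag, branch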
def run_do(self):
    try:
        running_tasks = job_utils.query_task(status='running', run_ip=get_lan_ip())
        stop_job_ids = set()
        # detect_logger.info('start to detect running job..')
        for task in running_tasks:
            try:
                process_exist = job_utils.check_job_process(int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'.format(
                            task.f_job_id, task.f_component_name, task.f_role,
                            task.f_party_id, task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)
        if stop_job_ids:
            schedule_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        for job_id in stop_job_ids:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                initiator_party_id = jobs[0].f_initiator_party_id
                job_work_mode = jobs[0].f_work_mode
                if len(jobs) > 1:
                    # i am initiator
                    my_party_id = initiator_party_id
                else:
                    my_party_id = jobs[0].f_party_id
                    initiator_party_id = jobs[0].f_initiator_party_id
                api_utils.federated_api(job_id=job_id,
                                        method='POST',
                                        endpoint='/{}/job/stop'.format(API_VERSION),
                                        src_party_id=my_party_id,
                                        dest_party_id=initiator_party_id,
                                        src_role=None,
                                        json_body={'job_id': job_id, 'operate': 'kill'},
                                        work_mode=job_work_mode)
                TaskScheduler.finish_job(job_id=job_id,
                                         job_runtime_conf=json_loads(jobs[0].f_runtime_conf),
                                         stop=True)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')
def check_request_parameters(request_data):
    with DB.connection_context():
        if 'role' not in request_data and 'party_id' not in request_data:
            jobs = Job.select(Job.f_runtime_conf).where(Job.f_job_id == request_data.get('job_id', ''),
                                                        Job.f_is_initiator == 1)
            if jobs:
                job = jobs[0]
                job_runtime_conf = json_loads(job.f_runtime_conf)
                job_initiator = job_runtime_conf.get('initiator', {})
                role = job_initiator.get('role', '')
                party_id = job_initiator.get('party_id', 0)
                request_data['role'] = role
                request_data['party_id'] = party_id
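# Sketch: when a request omits both role and party_id, they are filled in
# place from the initiator recorded in the job's runtime conf, as the
# function above does. The job id is hypothetical.
def _example_fill_request_parameters():
    request_data = {'job_id': '20201030151212421430'}
    check_request_parameters(request_data)
    return request_data  # now carries 'role'/'party_id' if the initiator job exists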
def get_job_configuration(job_id, role, party_id, tasks=None):
    with DB.connection_context():
        if tasks:
            jobs_run_conf = {}
            for task in tasks:
                jobs = Job.select(Job.f_job_id, Job.f_runtime_conf, Job.f_description).where(
                    Job.f_job_id == task.f_job_id)
                job = jobs[0]
                jobs_run_conf[job.f_job_id] = json_loads(job.f_runtime_conf)["role_parameters"]["local"]["upload_0"]
                jobs_run_conf[job.f_job_id]["notes"] = job.f_description
            return jobs_run_conf
        else:
            jobs = Job.select(Job.f_dsl, Job.f_runtime_conf, Job.f_train_runtime_conf).where(
                Job.f_job_id == job_id,
                Job.f_role == role,
                Job.f_party_id == party_id)
            if jobs:
                job = jobs[0]
                return json_loads(job.f_dsl), json_loads(job.f_runtime_conf), json_loads(job.f_train_runtime_conf)
            else:
                return {}, {}, {}
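# Sketch: load the stored DSL and confs for one party of a job; with no
# tasks argument, the three-tuple branch above is taken. All values are
# hypothetical.
def _example_load_job_configuration():
    dsl, runtime_conf, train_runtime_conf = get_job_configuration(
        job_id='20201030151212421430', role='guest', party_id=9999)
    return dsl, runtime_conf, train_runtime_conf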
def cancel_job(job_id, role, party_id, job_initiator):
    schedule_logger(job_id).info('{} {} get cancel waiting job {} command'.format(role, party_id, job_id))
    jobs = job_utils.query_job(job_id=job_id)
    if jobs:
        job = jobs[0]
        job_runtime_conf = json_loads(job.f_runtime_conf)
        event = job_utils.job_event(job.f_job_id,
                                    job_runtime_conf['initiator']['role'],
                                    job_runtime_conf['initiator']['party_id'])
        try:
            RuntimeConfig.JOB_QUEUE.del_event(event)
        except Exception:
            return False
        schedule_logger(job_id).info('cancel waiting job successfully, job id is {}'.format(job.f_job_id))
        return True
    else:
        raise Exception('role {} party id {} cancel waiting job failed, cannot find job {}'.format(
            role, party_id, job_id))
def get_data_table_metas(data_table_name, data_table_namespace):
    """
    get data table meta information
    :param data_table_name: table name of this data table
    :param data_table_namespace: namespace of this data table
    :return:
    """
    from arch.api.utils.core_utils import json_loads
    data_meta_table = FateSession.get_instance().table(name="%s.meta" % data_table_name,
                                                       namespace=data_table_namespace,
                                                       partition=1,
                                                       persistent=True,
                                                       in_place_computing=False,
                                                       create_if_missing=True,
                                                       error_if_exist=False)
    if data_meta_table:
        metas = dict()
        for k, v in data_meta_table.collect(use_serialize=False):
            metas[k] = json_loads(v)
        return metas
    else:
        return None
def submit_job(job_data, job_id=None):
    if not job_id:
        job_id = generate_job_id()
    schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
    job_dsl = job_data.get('job_dsl', {})
    job_runtime_conf = job_data.get('job_runtime_conf', {})
    job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    job_type = job_parameters.get('job_type', '')
    if job_type != 'predict':
        # generate job model info
        job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
        job_parameters['model_version'] = job_id
        train_runtime_conf = {}
    else:
        detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
        # get inference dsl from pipeline model as job dsl
        job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                               model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
        pipeline_model = job_tracker.get_output_model('pipeline')
        job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
        train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
    path_dict = save_job_conf(job_id=job_id,
                              job_dsl=job_dsl,
                              job_runtime_conf=job_runtime_conf,
                              train_runtime_conf=train_runtime_conf,
                              pipeline_dsl=None)
    job = Job()
    job.f_job_id = job_id
    job.f_roles = json_dumps(job_runtime_conf['role'])
    job.f_work_mode = job_parameters['work_mode']
    job.f_initiator_party_id = job_initiator['party_id']
    job.f_dsl = json_dumps(job_dsl)
    job.f_runtime_conf = json_dumps(job_runtime_conf)
    job.f_train_runtime_conf = json_dumps(train_runtime_conf)
    job.f_run_ip = ''
    job.f_status = JobStatus.WAITING
    job.f_progress = 0
    job.f_create_time = current_timestamp()
    initiator_role = job_initiator['role']
    initiator_party_id = job_initiator['party_id']
    if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
        schedule_logger(job_id).info("initiator party id error: {}".format(initiator_party_id))
        raise Exception("initiator party id error {}".format(initiator_party_id))
    get_job_dsl_parser(dsl=job_dsl,
                       runtime_conf=job_runtime_conf,
                       train_runtime_conf=train_runtime_conf)
    TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)
    # push into queue
    job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id)
    try:
        RuntimeConfig.JOB_QUEUE.put_event(job_event)
    except Exception as e:
        raise Exception('push job into queue failed')
    schedule_logger(job_id).info(
        'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
    board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
    logs_directory = get_job_log_directory(job_id)
    return job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory, \
           {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']}, board_url
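# Sketch: a minimal submit_job call, assuming check_pipeline_job_runtime_conf
# accepts a conf with only the keys read above (initiator, role map,
# job_parameters); model_id/model_version are generated automatically for
# non-predict jobs. All party ids and values are hypothetical.
def _example_submit_job():
    job_runtime_conf = {
        'initiator': {'role': 'guest', 'party_id': 9999},
        'role': {'guest': [9999], 'host': [10000], 'arbiter': [10000]},
        'job_parameters': {'work_mode': 0}
    }
    return submit_job({'job_dsl': {}, 'job_runtime_conf': job_runtime_conf})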
def get_version_info(version_table, commit_id):
    info = version_table.get(commit_id, use_serialize=False)
    if info:
        return json_loads(info)
    else:
        return dict()
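# Sketch: version_table is any table exposing get(key, use_serialize=False),
# e.g. the commit-tmp table used by get_commit_tmp above (assuming
# get_commit_tmp_table is importable here); an unknown commit id yields an
# empty dict. The namespace and commit id are hypothetical.
def _example_version_info():
    table = get_commit_tmp_table(data_table_namespace='experiment')
    return get_version_info(table, 'commit_0001')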