def get_upload_history():
    request_data = request.json
    # Query successful upload tasks, optionally restricted to a single job.
    if request_data.get('job_id'):
        tasks = job_utils.query_task(component_name='upload_0', status='success',
                                     job_id=request_data.get('job_id'))
    else:
        tasks = job_utils.query_task(component_name='upload_0', status='success')
    limit = request_data.get('limit')
    # Newest uploads first; apply the limit if one was given.
    if not limit:
        tasks = tasks[-1::-1]
    else:
        tasks = tasks[-1:-limit - 1:-1]
    jobs_run_conf = get_job_configuration(None, None, None, tasks)
    return get_upload_info(jobs_run_conf)
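# The limit handling above relies on Python's extended slicing: tasks[-1:-limit - 1:-1]
# walks the list backwards and keeps at most `limit` entries. A minimal, standalone sketch
# of that slicing behavior (plain Python, no FATE dependencies; the sample data is illustrative):
def _slice_sketch():
    tasks = ['t1', 't2', 't3', 't4', 't5']  # oldest .. newest

    limit = 2
    newest_first_all = tasks[-1::-1]                 # ['t5', 't4', 't3', 't2', 't1']
    newest_first_limited = tasks[-1:-limit - 1:-1]   # ['t5', 't4']

    assert newest_first_all == list(reversed(tasks))
    assert newest_first_limited == list(reversed(tasks))[:limit]
# Note that `limit` must already be an integer here; a string value taken straight from
# the JSON body would make the `-limit - 1` expression raise a TypeError.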
def kill_job(job_id, role, party_id, job_initiator, timeout=False):
    schedule_logger(job_id).info('{} {} get kill job {} command'.format(role, party_id, job_id))
    tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
    for task in tasks:
        kill_status = False
        try:
            # Kill the task's executor process.
            kill_status = job_utils.kill_process(int(task.f_run_pid))
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} process {} kill {}'.format(
                    job_id, task.f_component_name, task.f_role, task.f_party_id,
                    task.f_run_pid, 'success' if kill_status else 'failed'))
        # Mark the task as failed (or timed out) and sync the new status to the initiator.
        status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT
        if task.f_status != TaskStatus.SUCCESS:
            task.f_status = status
            try:
                TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                              task_id=task.f_task_id, role=role, party_id=party_id,
                                              initiator_party_id=job_initiator.get('party_id', None),
                                              task_info=task.to_json(),
                                              initiator_role=job_initiator.get('role', None))
            except Exception as e:
                schedule_logger(job_id).exception(e)
def check_task_status(job_id, component, interval=1):
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component.get_name())
    while True:
        try:
            status_collect = set()
            parameters = component.get_role_parameters()
            # Collect this component's task status from every participating role and party.
            for _role, _partys_parameters in parameters.items():
                for _party_parameters in _partys_parameters:
                    _party_id = _party_parameters.get('local', {}).get('party_id')
                    tasks = query_task(job_id=job_id, task_id=task_id, role=_role, party_id=_party_id)
                    if tasks:
                        task_status = tasks[0].f_status
                    else:
                        task_status = 'notRunning'
                    schedule_logger.info(
                        'job {} component {} run on {} {} status is {}'.format(
                            job_id, component.get_name(), _role, _party_id, task_status))
                    status_collect.add(task_status)
            # Fail fast if any party failed; succeed only when every party has succeeded.
            if 'failed' in status_collect:
                return False
            elif len(status_collect) == 1 and 'success' in status_collect:
                return True
            else:
                time.sleep(interval)
        except Exception as e:
            schedule_logger.exception(e)
            return False
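# A minimal sketch of how a scheduler loop might use this polling helper to gate downstream
# scheduling on upstream completion. This is illustrative only: `components` is assumed to be
# an ordered list of DSL component objects exposing get_name()/get_role_parameters(), as
# check_task_status expects, and run_pipeline is not part of the project's actual API.
def run_pipeline(job_id, components):
    for component in components:
        finished_ok = check_task_status(job_id, component, interval=2)
        if not finished_ok:
            # Any failed party (or an exception while polling) aborts the whole job.
            raise RuntimeError('component {} of job {} failed'.format(component.get_name(), job_id))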
def query_task():
    tasks = job_utils.query_task(**request.json)
    if not tasks:
        return get_json_result(retcode=101, retmsg='find task failed')
    return get_json_result(retcode=0, retmsg='success', data=[task.to_json() for task in tasks])
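# Assuming this handler is registered as a Flask POST route, a client call might look like the
# sketch below. The host, port, and route path are assumptions for illustration, not the
# project's documented endpoint.
def _query_task_client_sketch():
    import requests

    resp = requests.post('http://127.0.0.1:9380/v1/job/task/query',
                         json={'job_id': '<job_id>', 'role': 'guest', 'party_id': 10000})
    body = resp.json()
    if body['retcode'] == 0:
        for task in body['data']:
            print(task['f_task_id'], task['f_status'])
    else:
        print('query failed:', body['retmsg'])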
def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
    schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(role, party_id, job_id, component_name))
    task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
    tasks = job_utils.query_task(**task_info)
    job = job_utils.query_job(job_id=job_id)
    for task in tasks:
        kill_status = False
        try:
            # task clean up
            runtime_conf = json_loads(job[0].f_runtime_conf)
            roles = ','.join(runtime_conf['role'].keys())
            party_ids = ','.join([','.join([str(j) for j in i]) for i in runtime_conf['role'].values()])
            # Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
            # stop task
            kill_status = job_utils.kill_task_executor_process(task)
            # session stop
            job_utils.start_session_stop(task)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} process {} kill {}'.format(
                    job_id, task.f_component_name, task.f_role, task.f_party_id,
                    task.f_run_pid, 'success' if kill_status else 'failed'))
        # Mark the task as failed (or timed out) and sync the new status to the initiator.
        status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT
        if task.f_status != TaskStatus.COMPLETE:
            task.f_status = status
            try:
                TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                              task_id=task.f_task_id, role=role, party_id=party_id,
                                              initiator_party_id=job_initiator.get('party_id', None),
                                              task_info=task.to_json(),
                                              initiator_role=job_initiator.get('role', None))
            except Exception as e:
                schedule_logger(job_id).exception(e)
def run_do(self):
    try:
        running_tasks = job_utils.query_task(status='running', run_ip=get_lan_ip())
        stop_job_ids = set()
        # detect_logger.info('start to detect running job..')
        for task in running_tasks:
            try:
                # A "running" task whose executor process has disappeared means its job must be stopped.
                process_exist = job_utils.check_job_process(int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'.format(
                            task.f_job_id, task.f_component_name, task.f_role,
                            task.f_party_id, task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)
        if stop_job_ids:
            schedule_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        for job_id in stop_job_ids:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                initiator_party_id = jobs[0].f_initiator_party_id
                job_work_mode = jobs[0].f_work_mode
                if len(jobs) > 1:
                    # i am initiator
                    my_party_id = initiator_party_id
                else:
                    my_party_id = jobs[0].f_party_id
                    initiator_party_id = jobs[0].f_initiator_party_id
                # Ask the initiator party to kill the job, then finish it locally.
                api_utils.federated_api(job_id=job_id,
                                        method='POST',
                                        endpoint='/{}/job/stop'.format(API_VERSION),
                                        src_party_id=my_party_id,
                                        dest_party_id=initiator_party_id,
                                        src_role=None,
                                        json_body={'job_id': job_id, 'operate': 'kill'},
                                        work_mode=job_work_mode)
                TaskScheduler.finish_job(job_id=job_id,
                                         job_runtime_conf=json_loads(jobs[0].f_runtime_conf),
                                         stop=True)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')
def clean_job(job_id, role, party_id):
    schedule_logger.info('job {} on {} {} start to clean'.format(job_id, role, party_id))
    tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
    for task in tasks:
        try:
            Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task()
            schedule_logger.info(
                'job {} component {} on {} {} clean done'.format(job_id, task.f_component_name, role, party_id))
        except Exception as e:
            schedule_logger.info(
                'job {} component {} on {} {} clean failed'.format(job_id, task.f_component_name, role, party_id))
            schedule_logger.exception(e)
    schedule_logger.info('job {} on {} {} clean done'.format(job_id, role, party_id))
def run_do(self):
    try:
        running_tasks = job_utils.query_task(status='running', run_ip=get_lan_ip())
        stop_job_ids = set()
        detect_logger.info('start to detect running job..')
        for task in running_tasks:
            try:
                # A "running" task whose executor process has disappeared means its job must be stopped.
                process_exist = job_utils.check_job_process(int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'.format(
                            task.f_job_id, task.f_component_name, task.f_role,
                            task.f_party_id, task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)
        if stop_job_ids:
            schedule_logger.info('start to stop jobs: {}'.format(stop_job_ids))
        for job_id in stop_job_ids:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                initiator_party_id = jobs[0].f_initiator_party_id
                job_work_mode = jobs[0].f_work_mode
                if len(jobs) > 1:
                    # i am initiator
                    my_party_id = initiator_party_id
                else:
                    my_party_id = jobs[0].f_party_id
                    initiator_party_id = jobs[0].f_initiator_party_id
                # Stop the job through the local API; unlike the federated variant,
                # this does not forward the request to the initiator party.
                api_utils.local_api(method='POST',
                                    endpoint='/{}/job/stop'.format(API_VERSION),
                                    json_body={'job_id': job_id})
                schedule_logger.info('send stop job {} command'.format(job_id))
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')