def kill(self, task):
    """Kill the executor process of *task*, then stop its session when appropriate.

    The session is stopped when the process was actually killed, or when the
    task has already moved past WAITING (i.e. a session may have been created).

    :param task: task record carrying at least ``f_status``.
    """
    kill_status_code = process_utils.kill_task_executor_process(task)
    # session stop
    # Fix: compare the status code with `==`, not `is`. Identity comparison
    # of status constants only works by accident (e.g. small-int interning)
    # and is inconsistent with kill_task(), which uses `==`.
    if kill_status_code == KillProcessRetCode.KILLED or task.f_status not in {TaskStatus.WAITING}:
        job_utils.start_session_stop(task)
def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
    """Kill the executor processes of a job's tasks and sync their final status.

    :param job_id: id of the job to kill.
    :param role: local role issuing the kill.
    :param party_id: local party id.
    :param job_initiator: dict with the initiator's ``role`` / ``party_id``.
    :param timeout: when True, mark killed tasks TIMEOUT instead of FAILED.
    :param component_name: restrict the kill to one component; '' means all.
    """
    schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(role, party_id, job_id, component_name))
    task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
    tasks = job_utils.query_task(**task_info)
    # Fix: the original loop rebuilt roles/party_ids from query_job()[0] on
    # every iteration, feeding only a commented-out Tracking(...).clean_task()
    # call. Besides being dead work, an empty query_job() result raised
    # IndexError *before* the kill, so no task process was ever killed.
    # The dead computation is removed so the kill always runs.
    for task in tasks:
        kill_status = False
        try:
            # stop task process
            kill_status = job_utils.kill_task_executor_process(task)
            # session stop
            job_utils.start_session_stop(task)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info(
                'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name,
                                                                         task.f_role, task.f_party_id,
                                                                         task.f_run_pid,
                                                                         'success' if kill_status else 'failed'))
        status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT
        if task.f_status != TaskStatus.COMPLETE:
            task.f_status = status
        try:
            # best-effort: report the final task status back to the initiator
            TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                          task_id=task.f_task_id, role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          task_info=task.to_json(),
                                          initiator_role=job_initiator.get('role', None))
        except Exception as e:
            schedule_logger(job_id).exception(e)
def start_clean_job(cls, **kwargs):
    """Best-effort cleanup of every task matching *kwargs*.

    For each task: stop its session, drop its data tables, and delete its
    metric data. Each step is attempted independently; a failure in one
    step is logged and does not abort the others.

    :param kwargs: filter passed straight to ``JobSaver.query_task``.
    :raises Exception: when no task matches the filter.
    """
    tasks = JobSaver.query_task(**kwargs)
    if tasks:
        for task in tasks:
            try:
                # clean session
                stat_logger.info('start {} {} {} {} session stop'.format(
                    task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
                start_session_stop(task)
                stat_logger.info('stop {} {} {} {} session success'.format(
                    task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            except Exception as e:
                # Fix: the original silently swallowed the error (`pass`);
                # log it like the other cleanup steps, but stay best-effort.
                stat_logger.exception(e)
            try:
                # clean data table
                JobClean.clean_table(job_id=task.f_job_id,
                                     role=task.f_role,
                                     party_id=task.f_party_id,
                                     component_name=task.f_component_name)
            except Exception as e:
                stat_logger.info(
                    'delete {} {} {} {} data table failed'.format(
                        task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
                stat_logger.exception(e)
            try:
                # clean metric data
                stat_logger.info(
                    'start delete {} {} {} {} metric data'.format(
                        task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
                delete_metric_data({
                    'job_id': task.f_job_id,
                    'role': task.f_role,
                    'party_id': task.f_party_id,
                    'component_name': task.f_component_name
                })
                stat_logger.info(
                    'delete {} {} {} {} metric data success'.format(
                        task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
            except Exception as e:
                stat_logger.info(
                    'delete {} {} {} {} metric data failed'.format(
                        task.f_job_id, task.f_role, task.f_party_id, task.f_component_name))
                stat_logger.exception(e)
    else:
        raise Exception('no found task')
def kill_task(cls, task: Task):
    """Kill the executor process of *task* and stop its session if needed.

    The session stop is issued when the process was actually killed, or
    when the task status has already moved past WAITING. Any error is
    logged against the task's job; the outcome is always logged.

    :param task: the task whose executor process should be killed.
    :return: True when kill and session stop completed without error.
    """
    succeeded = False
    try:
        # kill task executor
        ret_code = job_utils.kill_task_executor_process(task)
        # session stop
        must_stop_session = (
            ret_code == KillProcessStatusCode.KILLED
            or task.f_status not in {TaskStatus.WAITING}
        )
        if must_stop_session:
            job_utils.start_session_stop(task)
        succeeded = True
    except Exception as e:
        schedule_logger(task.f_job_id).exception(e)
    finally:
        schedule_logger(task.f_job_id).info(
            'job {} task {} {} on {} {} process {} kill {}'.format(
                task.f_job_id, task.f_task_id, task.f_task_version,
                task.f_role, task.f_party_id, task.f_run_pid,
                'success' if succeeded else 'failed'))
    return succeeded
def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
    """Kill the running processes of a job's tasks and push the final status.

    :param job_id: id of the job to kill.
    :param role: local role issuing the kill.
    :param party_id: local party id.
    :param job_initiator: dict with the initiator's ``role`` / ``party_id``.
    :param timeout: when True, killed tasks become TIMEOUT instead of FAILED.
    :param component_name: restrict the kill to one component; '' means all.
    """
    schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(
        role, party_id, job_id, component_name))
    task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
    for task in job_utils.query_task(**task_info):
        process_killed = False
        try:
            # kill the executor process by pid, then tear down its session
            process_killed = job_utils.kill_process(int(task.f_run_pid))
            job_utils.start_session_stop(task)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            outcome = 'success' if process_killed else 'failed'
            schedule_logger(job_id).info(
                'job {} component {} on {} {} process {} kill {}'.format(
                    job_id, task.f_component_name, task.f_role,
                    task.f_party_id, task.f_run_pid, outcome))
        # tasks that never completed are marked TIMEOUT or FAILED
        new_status = TaskStatus.TIMEOUT if timeout else TaskStatus.FAILED
        if task.f_status != TaskStatus.COMPLETE:
            task.f_status = new_status
        try:
            # best-effort: report the final task status to the initiator
            TaskExecutor.sync_task_status(
                job_id=job_id,
                component_name=task.f_component_name,
                task_id=task.f_task_id,
                role=role,
                party_id=party_id,
                initiator_party_id=job_initiator.get('party_id', None),
                task_info=task.to_json(),
                initiator_role=job_initiator.get('role', None))
        except Exception as e:
            schedule_logger(job_id).exception(e)
def detect_expired_session(cls):
    """Stop every session whose record exceeded the 30-minute TTL.

    Fix: the original body called ``job_utils.start_session_stop()`` with no
    argument inside the loop, so the loop variable was unused and the call
    raised TypeError on the first expired record (every other call site
    passes the item to stop).
    """
    sessions_record = StorageSessionBase.query_expired_sessions_record(
        ttl=30 * 60 * 1000)  # 30 minutes, in milliseconds
    for session_record in sessions_record:
        # NOTE(review): start_session_stop is called elsewhere with a task
        # object; confirm it also accepts a session record.
        job_utils.start_session_stop(session_record)