def get_lock_if_job_is_runnable(app_name, job_id):
    """Return an execute lock for the job, or False if it is not runnable.

    The job is considered runnable only when its state is pending and its
    execute lock can be obtained without blocking.

    Args:
        app_name: name of the application the job belongs to.
        job_id: identifier of the job to check and lock.

    Returns:
        The object returned by ``qb.obtain_execute_lock`` on success, or
        ``False`` when the job is not pending or the lock is unavailable.

    Raises:
        Propagates whatever ``qb.check_state`` raises. It is called with
        ``raise_if_not_exists=True``, so presumably a missing job state
        raises -- confirm against the queue backend.
    """
    is_pending = qb.check_state(
        app_name, job_id, pending=True, raise_if_not_exists=True)
    if not is_pending:
        try:
            raise RuntimeError(
                "I found a job in queue that wasn't"
                " in state pending. This might be a code bug. You"
                " probably queued 2+ of the same job!")
        except RuntimeError as err:
            # Raising and immediately catching forces a traceback into the
            # logs; the current state is attached to help debugging.
            log.exception(
                err, extra=dict(
                    app_name=app_name, job_id=job_id,
                    state=qb.check_state(app_name, job_id, _get=True)))
            return False

    lock = qb.obtain_execute_lock(app_name, job_id, blocking=False)
    if lock is False:
        # `warning` replaces the deprecated `warn` alias.
        log.warning(
            'Could not obtain execute lock for task because'
            ' something is already processing this job_id',
            extra=dict(app_name=app_name, job_id=job_id))
        return False
    return lock
def get_lock_if_job_is_runnable(app_name, job_id):
    """Try to claim the job for execution.

    Checks that the job is in the pending state and then attempts a
    non-blocking acquisition of its execute lock.

    Args:
        app_name: name of the application the job belongs to.
        job_id: identifier of the job being examined.

    Returns:
        The lock returned by ``qb.obtain_execute_lock`` if the job is ready
        to execute; ``False`` if the job is not pending or the lock could
        not be obtained.

    Raises:
        Any exception raised by ``qb.check_state``. The call passes
        ``raise_if_not_exists=True``, so a job without state presumably
        raises -- verify against the queue backend.
    """
    job_info = dict(app_name=app_name, job_id=job_id)

    pending = qb.check_state(
        app_name, job_id, pending=True, raise_if_not_exists=True)
    if not pending:
        try:
            raise RuntimeError(
                "I found a job in queue that wasn't"
                " in state pending. This might be a code bug. You"
                " probably queued 2+ of the same job!")
        except RuntimeError as err:
            # force a traceback in the logs
            current_state = qb.check_state(app_name, job_id, _get=True)
            log.exception(err, extra=dict(job_info, state=current_state))
        return False

    execute_lock = qb.obtain_execute_lock(app_name, job_id, blocking=False)
    if execute_lock is False:
        # Logger.warn is a deprecated alias; use Logger.warning.
        log.warning(
            'Could not obtain execute lock for task because'
            ' something is already processing this job_id',
            extra=job_info)
        return False
    return execute_lock
def main(ns):
    """Process a single job for the application named by ``ns.app_name``.

    With ``ns.bypass_scheduler`` set, the task function is run directly and
    nothing is read from or written to a queue.

    Otherwise a job_id is taken from ``ns.job_id`` if given, or fetched from
    the application's queue.  The job runs only if an execute lock was
    obtained and ``parents_completed`` approves it.  Afterwards the outcome
    is passed to ``_handle_success`` or ``_handle_failure``; per the original
    description these queue children, or requeue / permanently fail the job.

    Args:
        ns: namespace of options.  Attributes read here: ``app_name``,
            ``bypass_scheduler``, ``job_id``, ``timeout`` and
            ``job_type_func``.  ``job_id`` is overwritten when a job is
            fetched from the queue.
    """
    def _job_extra(**more):
        # Built on demand: ns.job_id may be assigned from the queue below.
        return dict(app_name=ns.app_name, job_id=ns.job_id, **more)

    assert ns.app_name in dt.get_task_names()
    if ns.bypass_scheduler:
        log.info(
            "Running a task without scheduling anything"
            " or fetching from a queue",
            extra=_job_extra())
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    q = qb.get_qbclient().LockingQueue(ns.app_name)
    if ns.job_id:
        lock = _handle_manually_given_job_id(ns)
        q.consume = object  # do nothing
    else:
        ns.job_id = q.get(timeout=ns.timeout)
        if not validate_job_id(
                app_name=ns.app_name, job_id=ns.job_id,
                q=q, timeout=ns.timeout):
            return
        try:
            lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name, job_id=ns.job_id)
        except exceptions.NoNodeError:
            q.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist? The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=_job_extra())
            return

    # NOTE(review): nesting was reconstructed from flattened source; confirm
    # this debug log is meant to run for manually supplied job_ids as well.
    log.debug("Stolos got a job_id.", extra=_job_extra(acquired_lock=bool(lock)))

    if lock is False:
        # infinite loop: some jobs will always requeue if lock is unobtainable
        log.info(
            "Could not obtain a lock. Will requeue and try again later",
            extra=_job_extra())
        _send_to_back_of_queue(q=q, app_name=ns.app_name, job_id=ns.job_id)
        return

    if not parents_completed(ns.app_name, ns.job_id, q=q, lock=lock):
        return

    log.info("Job starting!", extra=_job_extra())
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:
        # assume error is previously logged
        _handle_failure(ns, q, lock)
    except Exception as err:
        log.exception(
            ("Job failed! Unhandled exception in an application!"
             " Fix ASAP because"
             " it is unclear how to handle this failure. %s: %s")
            % (err.__class__.__name__, err),
            extra=_job_extra(failed=True))
    else:
        _handle_success(ns, q, lock)
def main(ns):
    """Fetch one job_id for ``ns.app_name`` and decide what to do with it.

    Flow, as implemented below:

    * ``ns.bypass_scheduler``: call ``ns.job_type_func`` directly and return,
      without touching any queue.
    * Otherwise obtain a job_id (from ``ns.job_id`` or from the queue) and an
      execute lock.  If no lock is held, send the job to the back of the
      queue.  If ``parents_completed`` returns a falsy value, stop.
    * Run ``ns.job_type_func``.  A ``CodeError`` goes to ``_handle_failure``;
      any other exception is logged and the function returns; a clean run
      goes to ``_handle_success``.

    Args:
        ns: options namespace providing ``app_name``, ``bypass_scheduler``,
            ``job_id``, ``timeout`` and ``job_type_func``.  ``ns.job_id`` is
            assigned here when the job comes from the queue.
    """
    assert ns.app_name in dt.get_task_names()

    if ns.bypass_scheduler:
        bypass_msg = (
            "Running a task without scheduling anything"
            " or fetching from a queue")
        log.info(bypass_msg, extra=dict(app_name=ns.app_name, job_id=ns.job_id))
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    queue = qb.get_qbclient().LockingQueue(ns.app_name)

    if not ns.job_id:
        # No job was given explicitly: take the next one from the queue.
        ns.job_id = queue.get(timeout=ns.timeout)
        job_is_valid = validate_job_id(
            app_name=ns.app_name, job_id=ns.job_id,
            q=queue, timeout=ns.timeout)
        if not job_is_valid:
            return
        try:
            execute_lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name, job_id=ns.job_id)
        except exceptions.NoNodeError:
            queue.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist? The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=dict(app_name=ns.app_name, job_id=ns.job_id))
            return
    else:
        execute_lock = _handle_manually_given_job_id(ns)
        # The job did not come from the queue, so consuming must be a no-op.
        queue.consume = object

    # NOTE(review): nesting was reconstructed from flattened source; confirm
    # this debug log belongs outside the branch above.
    log.debug(
        "Stolos got a job_id.",
        extra=dict(
            app_name=ns.app_name, job_id=ns.job_id,
            acquired_lock=bool(execute_lock)))

    if execute_lock is False:
        # infinite loop: some jobs will always requeue if lock is unobtainable
        log.info(
            "Could not obtain a lock. Will requeue and try again later",
            extra=dict(app_name=ns.app_name, job_id=ns.job_id))
        _send_to_back_of_queue(
            q=queue, app_name=ns.app_name, job_id=ns.job_id)
        return

    ready = parents_completed(
        ns.app_name, ns.job_id, q=queue, lock=execute_lock)
    if not ready:
        return

    log.info(
        "Job starting!", extra=dict(app_name=ns.app_name, job_id=ns.job_id))
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:
        # assume error is previously logged
        _handle_failure(ns, queue, execute_lock)
        return
    except Exception as err:
        failure_msg = (
            "Job failed! Unhandled exception in an application!"
            " Fix ASAP because"
            " it is unclear how to handle this failure. %s: %s"
        ) % (type(err).__name__, err)
        log.exception(
            failure_msg,
            extra=dict(app_name=ns.app_name, job_id=ns.job_id, failed=True))
        return
    _handle_success(ns, queue, execute_lock)