Example #1
0
def get_lock_if_job_is_runnable(app_name, job_id):
    """Return an execute lock for the job, or False if it cannot run now.

    The job is considered runnable only when its state is pending and its
    execute lock can be obtained without blocking.

    Args:
        app_name: name of the application whose queue holds the job.
        job_id: identifier of the job to lock.

    Returns:
        Whatever `qb.obtain_execute_lock` returned when the lock was
        obtained, or False when the job is not pending or the lock is
        unavailable.

    Raises:
        Any error raised by `qb.check_state(..., raise_if_not_exists=True)`
        propagates to the caller (presumably when the job has no recorded
        state -- TODO confirm against the queue backend).
    """
    is_pending = qb.check_state(
        app_name, job_id, pending=True, raise_if_not_exists=True)
    if not is_pending:
        try:
            raise RuntimeError(
                "I found a job in queue that wasn't"
                " in state pending. This might be a code bug. You"
                " probably queued 2+ of the same job!")
        except RuntimeError as err:
            # Raise-and-catch on purpose: log.exception needs an active
            # exception in order to force a traceback into the logs.
            log.exception(
                err, extra=dict(
                    app_name=app_name,
                    job_id=job_id,
                    state=qb.check_state(
                        app_name, job_id, _get=True)))
        return False

    # Non-blocking attempt: never wait for whoever currently holds the lock.
    lock = qb.obtain_execute_lock(app_name, job_id, blocking=False)
    if lock is False:
        # `warning` replaces the deprecated `warn` alias.
        log.warning(
            'Could not obtain execute lock for task because'
            ' something is already processing this job_id',
            extra=dict(app_name=app_name, job_id=job_id))
        return False
    return lock
Example #2
0
def get_lock_if_job_is_runnable(app_name, job_id):
    """Return an execute lock for the job, or False if it cannot run now.

    The job is considered runnable only when its state is pending and its
    execute lock can be obtained without blocking.

    Args:
        app_name: name of the application whose queue holds the job.
        job_id: identifier of the job to lock.

    Returns:
        Whatever `qb.obtain_execute_lock` returned when the lock was
        obtained, or False when the job is not pending or the lock is
        unavailable.

    Raises:
        Any error raised by `qb.check_state(..., raise_if_not_exists=True)`
        propagates to the caller (presumably when the job has no recorded
        state -- TODO confirm against the queue backend).
    """
    is_pending = qb.check_state(app_name,
                                job_id,
                                pending=True,
                                raise_if_not_exists=True)
    if not is_pending:
        try:
            raise RuntimeError(
                "I found a job in queue that wasn't"
                " in state pending. This might be a code bug. You"
                " probably queued 2+ of the same job!")
        except RuntimeError as err:
            # Raise-and-catch on purpose: log.exception needs an active
            # exception in order to force a traceback into the logs.
            current_state = qb.check_state(app_name, job_id, _get=True)
            log.exception(err,
                          extra=dict(app_name=app_name,
                                     job_id=job_id,
                                     state=current_state))
        return False

    # Non-blocking attempt: never wait for whoever currently holds the lock.
    lock = qb.obtain_execute_lock(app_name, job_id, blocking=False)
    if lock is False:
        # `warning` replaces the deprecated `warn` alias.
        log.warning(
            'Could not obtain execute lock for task because'
            ' something is already processing this job_id',
            extra=dict(app_name=app_name, job_id=job_id))
        return False
    return lock
Example #3
0
def main(ns):
    """
    Take one job_id for `ns.app_name` and decide what to do with it.

    Flow, as implemented below:

    * With `ns.bypass_scheduler` set, `ns.job_type_func` is called directly
      and nothing is fetched from or written to a queue.
    * Otherwise the job_id is either the one supplied in `ns.job_id` or the
      next one taken from the app's locking queue.  A fetched job_id is
      validated and an execute lock is requested for it.
    * If no lock could be obtained, the job is sent to the back of the queue.
    * If `parents_completed` reports False, nothing is executed here.
    * Otherwise the job runs.  `exceptions.CodeError` is passed to
      `_handle_failure`, any other exception is only logged, and a clean run
      is passed to `_handle_success`.
    """
    assert ns.app_name in dt.get_task_names()

    def job_context(**more):
        # Builds a fresh `extra` dict for each log call, so it always
        # reflects the current value of ns.job_id.
        return dict(app_name=ns.app_name, job_id=ns.job_id, **more)

    if ns.bypass_scheduler:
        log.info(
            "Running a task without scheduling anything"
            " or fetching from a queue", extra=job_context())
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    queue = qb.get_qbclient().LockingQueue(ns.app_name)
    if not ns.job_id:
        # No job_id supplied: take the next one from the queue.
        ns.job_id = queue.get(timeout=ns.timeout)
        job_is_valid = validate_job_id(
            app_name=ns.app_name, job_id=ns.job_id,
            q=queue, timeout=ns.timeout)
        if not job_is_valid:
            return
        try:
            job_lock = get_lock_if_job_is_runnable(
                app_name=ns.app_name, job_id=ns.job_id)
        except exceptions.NoNodeError:
            # Drop the queue entry first, then record the traceback.
            queue.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist?  The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=job_context())
            return
    else:
        job_lock = _handle_manually_given_job_id(ns)
        # The job_id did not come from the queue in this function, so
        # consume is swapped for `object`: calling it with no arguments
        # just builds a throwaway object.
        queue.consume = object

    log.debug(
        "Stolos got a job_id.",
        extra=job_context(acquired_lock=bool(job_lock)))
    if job_lock is False:
        # infinite loop: some jobs will always requeue if lock is unobtainable
        log.info("Could not obtain a lock.  Will requeue and try again later",
                 extra=job_context())
        _send_to_back_of_queue(
            q=queue, app_name=ns.app_name, job_id=ns.job_id)
        return

    ready = parents_completed(ns.app_name, ns.job_id, q=queue, lock=job_lock)
    if not ready:
        return

    log.info("Job starting!", extra=job_context())
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:  # assume error is previously logged
        _handle_failure(ns, queue, job_lock)
    except Exception as err:
        failure_msg = (
            "Job failed!  Unhandled exception in an application!"
            " Fix ASAP because"
            " it is unclear how to handle this failure.  %s: %s"
        ) % (err.__class__.__name__, err)
        log.exception(failure_msg, extra=job_context(failed=True))
    else:
        _handle_success(ns, queue, job_lock)
Example #4
0
def main(ns):
    """
    Take one job_id for `ns.app_name` and decide what to do with it.

    Flow, as implemented below:

    * With `ns.bypass_scheduler` set, `ns.job_type_func` is called directly
      and nothing is fetched from or written to a queue.
    * Otherwise the job_id is either the one supplied in `ns.job_id` or the
      next one taken from the app's locking queue.  A fetched job_id is
      validated and an execute lock is requested for it.
    * If no lock could be obtained, the job is sent to the back of the queue.
    * If `parents_completed` reports False, nothing is executed here.
    * Otherwise the job runs.  `exceptions.CodeError` is passed to
      `_handle_failure`, any other exception is only logged, and a clean run
      is passed to `_handle_success`.
    """
    assert ns.app_name in dt.get_task_names()

    def job_context(**more):
        # Builds a fresh `extra` dict for each log call, so it always
        # reflects the current value of ns.job_id.
        return dict(app_name=ns.app_name, job_id=ns.job_id, **more)

    if ns.bypass_scheduler:
        log.info(
            "Running a task without scheduling anything"
            " or fetching from a queue",
            extra=job_context())
        assert ns.job_id
        ns.job_type_func(ns=ns)
        return

    log.info("Beginning Stolos", extra=dict(**ns.__dict__))
    queue = qb.get_qbclient().LockingQueue(ns.app_name)
    if not ns.job_id:
        # No job_id supplied: take the next one from the queue.
        ns.job_id = queue.get(timeout=ns.timeout)
        job_is_valid = validate_job_id(app_name=ns.app_name,
                                       job_id=ns.job_id,
                                       q=queue,
                                       timeout=ns.timeout)
        if not job_is_valid:
            return
        try:
            job_lock = get_lock_if_job_is_runnable(app_name=ns.app_name,
                                                   job_id=ns.job_id)
        except exceptions.NoNodeError:
            # Drop the queue entry first, then record the traceback.
            queue.consume()
            log.exception(
                "Job failed. The job is queued, so why does its state not"
                " exist?  The Queue backend may be in an inconsistent state."
                " Consuming this job",
                extra=job_context())
            return
    else:
        job_lock = _handle_manually_given_job_id(ns)
        # The job_id did not come from the queue in this function, so
        # consume is swapped for `object`: calling it with no arguments
        # just builds a throwaway object.
        queue.consume = object

    log.debug("Stolos got a job_id.",
              extra=job_context(acquired_lock=bool(job_lock)))
    if job_lock is False:
        # infinite loop: some jobs will always requeue if lock is unobtainable
        log.info("Could not obtain a lock.  Will requeue and try again later",
                 extra=job_context())
        _send_to_back_of_queue(q=queue, app_name=ns.app_name, job_id=ns.job_id)
        return

    ready = parents_completed(ns.app_name, ns.job_id, q=queue, lock=job_lock)
    if not ready:
        return

    log.info("Job starting!", extra=job_context())
    try:
        ns.job_type_func(ns=ns)
    except exceptions.CodeError:  # assume error is previously logged
        _handle_failure(ns, queue, job_lock)
    except Exception as err:
        failure_msg = ("Job failed!  Unhandled exception in an application!"
                       " Fix ASAP because"
                       " it is unclear how to handle this failure.  %s: %s") % (
                           err.__class__.__name__, err)
        log.exception(failure_msg, extra=job_context(failed=True))
    else:
        _handle_success(ns, queue, job_lock)