def test_fail(self):
    retry_delay = 1
    with self.assertRaises(AssertionError):
        start = time.time()
        handle_broker_timeout(fail, retry_delay=retry_delay)
    delta = time.time() - start
    # Make sure we did not sleep or retry: a non-broker error must propagate immediately
    self.assertLess(delta, retry_delay)

def update_task_name(sender, task_id, *_args, **_kwargs):
    # Although the name was populated in populate_task_info at before_task_publish, that name
    # can be inaccurate if the task was a plugin. We can only overwrite it with the accurate
    # name at task_prerun.
    callable_func = current_app.backend.client.hset
    args = (task_id, 'name', sender.name)
    handle_broker_timeout(callable_func=callable_func, args=args, timeout=5 * 60,
                          reraise_on_timeout=False)

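# A minimal wiring sketch (an assumption, not shown in this module): update_task_name()
# matches the signature Celery passes to task_prerun receivers, so it would typically be
# registered during app setup with the real celery.signals API, e.g.:
#
#     from celery.signals import task_prerun
#     task_prerun.connect(update_task_name)
#
# task_prerun fires in the worker just before the task body runs, the earliest point at
# which sender.name is guaranteed to be the accurate (possibly plugin-overridden) name.
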
def test_success_not_reached_due_to_timeout(self):
    retry_delay = 0.1
    succeed_after_retries = 5
    timeout = 2 * retry_delay
    obj = SucceedAfter(succeed_after_retries)
    start = time.time()
    with self.assertRaises(TimeoutError):
        handle_broker_timeout(obj.foo, retry_delay=retry_delay, timeout=timeout)
    delta = time.time() - start
    self.assertGreater(delta, timeout)
    self.assertLess(delta, timeout + retry_delay)

def test_fail_with_timeout(self):
    timeout = 0.5
    retry_delay = 0.1
    with self.assertRaises(TimeoutError):
        start = time.time()
        handle_broker_timeout(fail_with_timeout, retry_delay=retry_delay, timeout=timeout)
    delta = time.time() - start
    self.assertGreater(delta, timeout)
    # handle_broker_timeout now uses an exponential retry delay, so allow some slack
    self.assertLess(delta, timeout + (retry_delay * 10))

def is_result_ready(result: AsyncResult, timeout=15 * 60, retry_delay=1):
    """Protect against the broker being temporarily unreachable and throwing a TimeoutError."""
    return handle_broker_timeout(result.ready, timeout=timeout, retry_delay=retry_delay)

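# A minimal sketch of the retry wrapper everything here relies on; it is an illustrative
# assumption, not this project's actual implementation (which presumably also catches
# broker-specific errors such as redis/kombu timeouts). It shows the contract the callers
# and tests above depend on: invoke callable_func, retry timed-out broker calls with a
# slowly growing delay, and once 'timeout' is exhausted either re-raise or swallow the
# TimeoutError depending on reraise_on_timeout. Any other exception propagates immediately.
def _handle_broker_timeout_sketch(callable_func, args=(), kwargs=None, timeout=None,
                                  retry_delay=1, reraise_on_timeout=True):
    kwargs = kwargs or {}
    start = time.monotonic()
    delay = retry_delay
    while True:
        try:
            return callable_func(*args, **kwargs)
        except TimeoutError:
            if timeout is not None and (time.monotonic() - start) >= timeout:
                if reraise_on_timeout:
                    raise
                return None
            time.sleep(delay)
            delay *= 1.01  # gentle exponential backoff, mirroring the wait loop below
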
def get_task_info_from_result(result, key: str = None):
    try:
        backend = result.app.backend
    except AttributeError:
        backend = current_app.backend
    if key is not None:
        info = handle_broker_timeout(backend.client.hget, args=(str(result), key))
    else:
        info = handle_broker_timeout(backend.client.get, args=(str(result),))
    if info is None:
        info = ''
    else:
        info = info.decode()
    return info

def _check_for_failure_in_parents(result, timeout=15 * 60, retry_delay=1):
    failed_parent = revoked_parent = None
    parent = handle_broker_timeout(getattr, args=(result, 'parent'),
                                   timeout=timeout, retry_delay=retry_delay)
    while parent and parent != result:
        state = handle_broker_timeout(getattr, args=(parent, 'state'),
                                      timeout=timeout, retry_delay=retry_delay)
        if state == FAILURE:
            failed_parent = parent
            break
        if state == REVOKED or RevokedRequests.instance().is_revoked(parent):
            revoked_parent = parent
            break
        result = parent
        parent = handle_broker_timeout(getattr, args=(parent, 'parent'),
                                       timeout=timeout, retry_delay=retry_delay)
    else:
        return  # <-- loop finished with no errors in parents

    if revoked_parent:
        raise ChainRevokedException(
            task_id=str(revoked_parent),
            task_name=get_task_name_from_result(revoked_parent))

    # If we get here, failed_parent holds a failed parent
    parent = handle_broker_timeout(getattr, args=(failed_parent, 'parent'),
                                   timeout=timeout, retry_delay=retry_delay)
    while parent and parent != failed_parent:
        # Find the first failed parent, now that celery propagates parent failures
        parent_failed = handle_broker_timeout(parent.failed, timeout=timeout,
                                              retry_delay=retry_delay)
        if not parent_failed:
            break
        failed_parent = parent
        parent = handle_broker_timeout(getattr, args=(parent, 'parent'),
                                       timeout=timeout, retry_delay=retry_delay)

    cause = handle_broker_timeout(getattr, args=(failed_parent, 'result'),
                                  timeout=timeout, retry_delay=retry_delay)
    cause = cause if isinstance(cause, Exception) else None
    raise ChainInterruptedException(
        task_id=str(failed_parent),
        task_name=get_task_name_from_result(failed_parent),
        cause=cause)

def test_succeed_after_retries(self):
    retry_delay = 0.1
    succeed_after_retries = 3
    obj = SucceedAfter(succeed_after_retries)
    start = time.time()
    self.assertIsNone(handle_broker_timeout(obj.foo, retry_delay=retry_delay))
    delta = time.time() - start
    self.assertGreater(delta, succeed_after_retries * retry_delay)
    self.assertLess(delta, (succeed_after_retries + 1) * retry_delay)

def wait_on_async_results(results,
                          max_wait=None,
                          callbacks: Iterator[WaitLoopCallBack] = tuple(),
                          sleep_between_iterations=0.05,
                          check_task_worker_frequency=600,
                          fail_on_worker_failures=7,
                          log_msg=True,
                          **_kwargs):
    if not results:
        return

    if isinstance(results, AsyncResult):
        results = [results]

    max_sleep = sleep_between_iterations * 20 * 15  # Somewhat arbitrary
    failures = []
    start_time = time.monotonic()
    last_callback_time = {callback.func: start_time for callback in callbacks}

    for result in results:
        logging_name = get_result_logging_name(result)
        if log_msg:
            logger.debug('-> Waiting for %s to complete' % logging_name)

        try:
            task_worker_failures = 0
            last_dead_task_worker_check = time.monotonic()
            while not is_result_ready(result):
                if RevokedRequests.instance().is_revoked(result):
                    break
                _check_for_failure_in_parents(result)

                current_time = time.monotonic()
                if max_wait and (current_time - start_time) > max_wait:
                    logging_name = get_result_logging_name(result)
                    raise WaitOnChainTimeoutError(
                        'Result ID %s was not ready in %d seconds' % (logging_name, max_wait))

                # callbacks
                for callback in callbacks:
                    if (current_time - last_callback_time[callback.func]) > callback.frequency:
                        callback.func(**callback.kwargs)
                        last_callback_time[callback.func] = current_time

                # Check for dead workers
                if check_task_worker_frequency and fail_on_worker_failures and \
                        (current_time - last_dead_task_worker_check) > check_task_worker_frequency:
                    alive = _is_worker_alive(result=result)
                    last_dead_task_worker_check = current_time
                    if not alive:
                        task_worker_failures += 1
                        logger.warning(
                            f'Task {get_task_name_from_result(result)} appears to be a zombie.'
                            f' Failures: {task_worker_failures}')
                        if task_worker_failures >= fail_on_worker_failures:
                            task_id = str(result)
                            task_name = get_task_name_from_result(result)
                            raise ChainInterruptedByZombieTaskException(
                                task_id=task_id, task_name=task_name)
                    else:
                        task_worker_failures = 0

                time.sleep(sleep_between_iterations)
                # Exponential backoff, capped at max_sleep
                sleep_between_iterations = sleep_between_iterations * 1.01 \
                    if sleep_between_iterations * 1.01 < max_sleep else max_sleep

            # If failure happened in a chain, raise from the failing task within the chain
            _check_for_failure_in_parents(result)

            result_state = handle_broker_timeout(getattr, args=(result, 'state'))
            if result_state == REVOKED:
                # wait for revoked tasks to actually finish running
                wait_for_running_tasks_from_results([result])
                raise ChainRevokedException(
                    task_id=str(result),
                    task_name=get_task_name_from_result(result))
            if result_state == PENDING:
                # Pending tasks can be in the revoke list; their state will still be PENDING.
                raise ChainRevokedPreRunException(
                    task_id=str(result),
                    task_name=get_task_name_from_result(result))
            if result_state == FAILURE:
                cause = result.result if isinstance(result.result, Exception) else None
                raise ChainInterruptedException(
                    task_id=str(result),
                    task_name=get_task_name_from_result(result),
                    cause=cause)
        except (ChainRevokedException, ChainInterruptedException) as e:
            failures.append(e)

    if len(failures) == 1:
        raise failures[0]
    elif failures:
        failed_task_ids = [e.task_id for e in failures if hasattr(e, 'task_id')]
        multi_exception = MultipleFailuresException(failed_task_ids)
        multi_exception.failures = failures
        raise multi_exception

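def _example_wait_with_progress(chain_result):
    # Illustrative caller sketch, not part of this module. wait_on_async_results() reads
    # each callback's 'func', 'frequency' (seconds) and 'kwargs' attributes, so a periodic
    # progress hook can be registered while waiting on a chain. Constructing
    # WaitLoopCallBack with these keywords is an assumption based on that usage.
    progress = WaitLoopCallBack(func=logger.info,
                                frequency=30,
                                kwargs={'msg': 'still waiting on chain...'})
    wait_on_async_results(chain_result, max_wait=2 * 3600, callbacks=[progress])
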
def _is_worker_alive(result: AsyncResult, retries=1):
    task_name = get_result_logging_name(result)
    tries = 0
    # NOTE: Retries guard against a possible false negative when the task changes host in the
    # small timing window between getting task state/info and checking for aliveness.
    # Retries for broker issues are handled downstream.
    while tries <= retries:
        state = handle_broker_timeout(lambda r: r.state, args=(result,))
        if not state:
            logger.debug(f'Cannot get state for {task_name}; assuming task is alive')
            return True
        if state == STARTED or state == RECEIVED:
            # Query the worker to see if it knows about this task
            info = handle_broker_timeout(lambda r: r.info, args=(result,))
            try:
                # NOTE: if the task completes after the check for state right above but before the
                # call to handle_broker_timeout(), the type of 'info' is whatever the task returned,
                # not the internal Celery dictionary we want. It can be an exception, or even a
                # dictionary with a random 'hostname'. In the latter case _is_worker_alive() will
                # return False, but since we retry _is_worker_alive() that should be fine -- this
                # timing issue cannot happen twice for the same task.
                hostname = info.get('hostname')
            except AttributeError:
                hostname = None
            if not hostname:
                logger.debug(
                    f'Cannot get run info for {task_name}; assuming task is alive.'
                    f' Info: {info}, Hostname: {hostname}')
                return True

            task_id = result.id
            task_info = get_task(method_args=(task_id,), destination=(hostname,), timeout=60)
            if task_info and any(task_info.values()):
                return True

            # Try get_active and get_reserved, since we suspect query_task (the API used by
            # get_task above) may be broken sometimes.
            active_tasks = get_active(destination=(hostname,), timeout=60)
            task_list = active_tasks.get(hostname) if active_tasks else None
            if task_list:
                for task in task_list:
                    if task.get('id') == task_id:
                        return True

            reserved_tasks = get_reserved(destination=(hostname,), timeout=60)
            task_list = reserved_tasks.get(hostname) if reserved_tasks else None
            if task_list:
                for task in task_list:
                    if task.get('id') == task_id:
                        return True

            logger.debug(
                f'Task inspection for {task_name} on {hostname} with id '
                f'of {task_id} returned:\n{pformat(task_info)}\n'
                f'Active tasks:\n{pformat(active_tasks)}\n'
                f'Reserved tasks:\n{pformat(reserved_tasks)}')
        elif state == PENDING or state == RETRY:
            # Check if the task queue is alive
            task_queue = get_task_queue_from_result(result)
            if not task_queue:
                logger.debug(f'Cannot get task queue for {task_name}; assuming task is alive.')
                return True
            queue_seen = was_queue_ready(queue_name=task_queue)
            if not queue_seen:
                logger.debug(
                    f'Queue "{task_queue}" for {task_name} not seen yet; assuming task is alive.')
                return True

            queues = get_active_queues(timeout=60)
            active_queues = {queue['name']
                            for node in queues.values()
                            for queue in node} if queues else set()
            if task_queue in active_queues:
                return True

            logger.debug(
                f'Active queues inspection for {task_name} on queue {task_queue} returned:\n'
                f'{pformat(queues)}\n'
                f'Active queues: {pformat(active_queues)}')
        elif state == SUCCESS:
            # Timing; possible if the task state changed after we waited on it but before we got here
            return True
        else:
            logger.debug(f'Unknown state ({state}) for task {task_name}; assuming task is alive.')
            return True

        tries += 1
        logger.info(f'Task {task_name} is not responding to queries. Tries: {tries}')

    return False

def test_passing_callable_with_args_and_kwargs(self):
    self.assertEqual(handle_broker_timeout(bar, args=(1,), kwargs={'b': 2}), 3)

def test_passing_callable_with_kwargs(self):
    self.assertEqual(handle_broker_timeout(bar, kwargs={'a': 1}), 2)

def test_passing_callable_with_args(self):
    self.assertEqual(handle_broker_timeout(bar, args=(1,)), 2)

def test_passing_callable(self):
    self.assertEqual(handle_broker_timeout(foo), 1)

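# A minimal sketch of the fixtures these tests assume; the real definitions live elsewhere
# in the test module, so treat these as illustrative. foo/bar exercise plain argument
# passing, fail checks that non-broker errors propagate without retries, fail_with_timeout
# simulates a persistently timing-out broker call, and SucceedAfter simulates a broker
# that recovers after a fixed number of failures.
def foo():
    return 1

def bar(a, b=1):
    return a + b

def fail():
    assert False  # AssertionError is not a broker timeout, so it must not be retried

def fail_with_timeout():
    raise TimeoutError()  # always times out; retried until 'timeout' is exhausted

class SucceedAfter:
    def __init__(self, succeed_after_attempts):
        self.attempts = 0
        self.succeed_after_attempts = succeed_after_attempts

    def foo(self):
        self.attempts += 1
        if self.attempts <= self.succeed_after_attempts:
            raise TimeoutError()
        # the tests assert the eventual successful return value is None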