def run_callable(self, body): """ Given some AMQP message, import the correct Python code and run it. """ task = body['task'] uuid = body.get('uuid', '<unknown>') args = body.get('args', []) kwargs = body.get('kwargs', {}) if 'guid' in body: set_guid(body.pop('guid')) _call = TaskWorker.resolve_callable(task) if inspect.isclass(_call): # the callable is a class, e.g., RunJob; instantiate and # return its `run()` method _call = _call().run log_extra = '' logger_method = logger.debug if ('time_ack' in body) and ('time_pub' in body): time_publish = body['time_ack'] - body['time_pub'] time_waiting = time.time() - body['time_ack'] if time_waiting > 5.0 or time_publish > 5.0: # If task too a very long time to process, add this information to the log log_extra = f' took {time_publish:.4f} to ack, {time_waiting:.4f} in local dispatcher' logger_method = logger.info # don't print kwargs, they often contain launch-time secrets logger_method(f'task {uuid} starting {task}(*{args}){log_extra}') return _call(*args, **kwargs)
def write(self, preferred_queue, body): if 'guid' in body: set_guid(body['guid']) try: # when the cluster heartbeat occurs, clean up internally if isinstance(body, dict) and 'cluster_node_heartbeat' in body['task']: self.cleanup() if self.should_grow: self.up() # we don't care about "preferred queue" round robin distribution, just # find the first non-busy worker and claim it workers = self.workers[:] random.shuffle(workers) for w in workers: if not w.busy: w.put(body) break else: return super(AutoscalePool, self).write(preferred_queue, body) except Exception: for conn in connections.all(): # If the database connection has a hiccup, re-establish a new # connection conn.close_if_unusable_or_obsolete() logger.exception('failed to write inbound message')
def write(self, preferred_queue, body): if 'guid' in body: set_guid(body['guid']) try: if isinstance(body, dict) and body.get('bind_kwargs'): self.add_bind_kwargs(body) # when the cluster heartbeat occurs, clean up internally if isinstance(body, dict) and 'cluster_node_heartbeat' in body['task']: self.cleanup() if self.should_grow: self.up() # we don't care about "preferred queue" round robin distribution, just # find the first non-busy worker and claim it workers = self.workers[:] random.shuffle(workers) for w in workers: if not w.busy: w.put(body) break else: task_name = 'unknown' if isinstance(body, dict): task_name = body.get('task') logger.warn(f'Workers maxed, queuing {task_name}, load: {sum(len(w.managed_tasks) for w in self.workers)} / {len(self.workers)}') return super(AutoscalePool, self).write(preferred_queue, body) except Exception: for conn in connections.all(): # If the database connection has a hiccup, re-establish a new # connection conn.close_if_unusable_or_obsolete() logger.exception('failed to write inbound message')
def perform_job(self, job, queue): """ Set the :class:`pulpcore.app.models.Task` to running and init logging. This method is called by the worker's work horse thread (the forked child) just before the task begins executing. Args: job (rq.job.Job): The job to perform queue (rq.queue.Queue): The Queue associated with the job """ try: task = Task.objects.get(pk=job.get_id()) except Task.DoesNotExist: pass else: if task.state != TASK_STATES.WAITING: return task.set_running() user = get_users_with_perms(task).first() _set_current_user(user) set_guid(task.logging_cid) with TaskWorkingDirectory(job): return super().perform_job(job, queue)
def worker_prerun(task: Task, **kwargs) -> None: """ Called before a worker starts executing a task. Here we make sure to set the appropriate correlation ID for all logs logged during the tasks, and on the thread in general. In that regard, this does the Celery equivalent to what the django-guid middleware does for a request. """ guid = task.request.get(settings.guid_header_name) if guid: logger.info('Setting GUID %s', guid) set_guid(guid) else: generated_guid = generate_guid( uuid_length=settings.integration_settings.celery.uuid_length) logger.info('Generated GUID %s', generated_guid) set_guid(generated_guid) if settings.integration_settings.celery.log_parent: origin = task.request.get(parent_header) if origin: logger.info('Setting parent ID %s', origin) celery_parent.set(origin) generated_current_guid = generate_guid( uuid_length=settings.integration_settings.celery.uuid_length) logger.info('Generated current ID %s', generated_current_guid) celery_current.set(generated_current_guid)
async def django_guid_api_usage(request: HttpRequest) -> JsonResponse: """ Uses each API function """ logger.info('Current GUID: %s', get_guid()) set_guid('another guid') logger.info('Current GUID: %s', get_guid()) clear_guid() logger.info('Current GUID: %s', get_guid()) return JsonResponse({'detail': ':)'})
def test_cleanup(monkeypatch, mocker: MockerFixture): """ Test that cleanup works as expected """ set_guid('123') celery_current.set('123') celery_parent.set('123') mocked_settings = deepcopy(django_settings.DJANGO_GUID) mocked_settings['INTEGRATIONS'] = [CeleryIntegration(log_parent=True)] with override_settings(DJANGO_GUID=mocked_settings): settings = Settings() monkeypatch.setattr('django_guid.integrations.celery.signals.settings', settings) clean_up(task=mocker.Mock()) assert [get_guid(), celery_current.get(), celery_parent.get()] == [None, None, None]
def run_callable(self, body): """ Given some AMQP message, import the correct Python code and run it. """ task = body['task'] uuid = body.get('uuid', '<unknown>') args = body.get('args', []) kwargs = body.get('kwargs', {}) if 'guid' in body: set_guid(body.pop('guid')) _call = TaskWorker.resolve_callable(task) if inspect.isclass(_call): # the callable is a class, e.g., RunJob; instantiate and # return its `run()` method _call = _call().run # don't print kwargs, they often contain launch-time secrets logger.debug('task {} starting {}(*{})'.format(uuid, task, args)) return _call(*args, **kwargs)
def _perform_task(task_pk, task_working_dir_rel_path): """Setup the environment to handle a task and execute it. This must be called as a subprocess, while the parent holds the advisory lock.""" signal.signal(signal.SIGINT, child_signal_handler) signal.signal(signal.SIGTERM, child_signal_handler) signal.signal(signal.SIGUSR1, child_signal_handler) # All processes need to create their own postgres connection connection.connection = None task = Task.objects.get(pk=task_pk) task.set_running() # Store the task id in the environment for `Task.current()`. os.environ["PULP_TASK_ID"] = str(task.pk) user = get_users_with_perms(task).first() _set_current_user(user) set_guid(task.logging_cid) try: _logger.info("Starting task {}".format(task.pk)) # Execute task module_name, function_name = task.name.rsplit(".", 1) module = importlib.import_module(module_name) func = getattr(module, function_name) args = json.loads(task.args) or () kwargs = json.loads(task.kwargs) or {} os.chdir(task_working_dir_rel_path) result = func(*args, **kwargs) if asyncio.iscoroutine(result): _logger.debug("Task is coroutine {}".format(task.pk)) loop = asyncio.get_event_loop() loop.run_until_complete(result) except Exception: exc_type, exc, tb = sys.exc_info() task.set_failed(exc, tb) _logger.info("Task {} failed ({})".format(task.pk, exc)) _logger.info("\n".join(traceback.format_list( traceback.extract_tb(tb)))) else: task.set_completed() _logger.info("Task completed {}".format(task.pk)) os.environ.pop("PULP_TASK_ID")
def run(): ppid = os.getppid() logger.warning('periodic beat started') while True: if os.getppid() != ppid: # if the parent PID changes, this process has been orphaned # via e.g., segfault or sigkill, we should exit too pid = os.getpid() logger.warning( f'periodic beat exiting gracefully pid:{pid}') raise SystemExit() try: for conn in connections.all(): # If the database connection has a hiccup, re-establish a new # connection conn.close_if_unusable_or_obsolete() set_guid(generate_guid()) self.run_pending() except Exception: logger.exception( 'encountered an error while scheduling periodic tasks') time.sleep(idle_seconds)
def test_task_publish_includes_correct_headers(monkeypatch): """ It's important that we include the correct headers when publishing a task to the celery worker pool, otherwise there's no transfer of state. """ # Mocking overhead mocked_settings = deepcopy(django_settings.DJANGO_GUID) mocked_settings['INTEGRATIONS'] = [CeleryIntegration(log_parent=False)] with override_settings(DJANGO_GUID=mocked_settings): settings = Settings() monkeypatch.setattr('django_guid.integrations.celery.signals.settings', settings) # Actual testing for correlation_id in [None, 'test', 123, -1]: # Set the id in our context var set_guid(correlation_id) # Run signal with empty headers headers = {} publish_task_from_worker_or_request(headers=headers) # Makes sure the returned headers contain the correct result assert headers[settings.guid_header_name] == correlation_id
def perform_work(self, body): try: flush = body.get('event') == 'FLUSH' if flush: self.last_event = '' if not flush: job_identifier = 'unknown job' for cls in (JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent): if cls.JOB_REFERENCE in body: job_identifier = body[cls.JOB_REFERENCE] break self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})' # noqa notification_trigger_event = bool( body.get('event') == cls.WRAPUP_EVENT) if body.get('event') == 'EOF': try: if 'guid' in body: set_guid(body['guid']) final_counter = body.get('final_counter', 0) logger.info( 'Starting EOF event processing for Job {}'.format( job_identifier)) # EOF events are sent when stdout for the running task is # closed. don't actually persist them to the database; we # just use them to report `summary` websocket events as an # approximation for when a job is "done" emit_channel_notification( 'jobs-summary', dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)) if notification_trigger_event: job_stats_wrapup(job_identifier) except Exception: logger.exception( 'Worker failed to perform EOF tasks: Job {}'. format(job_identifier)) finally: self.subsystem_metrics.inc( 'callback_receiver_events_in_memory', -1) set_guid('') return skip_websocket_message = body.pop('skip_websocket_message', False) event = cls.create_from_data(**body) if skip_websocket_message: # if this event sends websocket messages, fire them off on flush event._skip_websocket_message = True if notification_trigger_event: # if this is an Ansible stats event, ensure notifications on flush event._notification_trigger_event = True self.buff.setdefault(cls, []).append(event) retries = 0 while retries <= self.MAX_RETRIES: try: self.flush(force=flush) break except (OperationalError, InterfaceError, InternalError): if retries >= self.MAX_RETRIES: logger.exception( 'Worker could not re-establish database connectivity, giving up on one or more events.' ) return delay = 60 * retries logger.exception( 'Database Error Saving Job Event, retry #{i} in {delay} seconds:' .format(i=retries + 1, delay=delay)) django_connection.close() time.sleep(delay) retries += 1 except DatabaseError: logger.exception('Database Error Saving Job Event') break except Exception as exc: tb = traceback.format_exc() logger.error('Callback Task Processor Raised Exception: %r', exc) logger.error('Detail: {}'.format(tb))
def perform_work(self, body): try: flush = body.get('event') == 'FLUSH' if flush: self.last_event = '' if not flush: event_map = { 'job_id': JobEvent, 'ad_hoc_command_id': AdHocCommandEvent, 'project_update_id': ProjectUpdateEvent, 'inventory_update_id': InventoryUpdateEvent, 'system_job_id': SystemJobEvent, } job_identifier = 'unknown job' for key, cls in event_map.items(): if key in body: job_identifier = body[key] break self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})' # noqa if body.get('event') == 'EOF': try: if 'guid' in body: set_guid(body['guid']) final_counter = body.get('final_counter', 0) logger.info('Event processing is finished for Job {}, sending notifications'.format(job_identifier)) # EOF events are sent when stdout for the running task is # closed. don't actually persist them to the database; we # just use them to report `summary` websocket events as an # approximation for when a job is "done" emit_channel_notification('jobs-summary', dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)) # Additionally, when we've processed all events, we should # have all the data we need to send out success/failure # notification templates uj = UnifiedJob.objects.get(pk=job_identifier) if isinstance(uj, Job): # *actual playbooks* send their success/failure # notifications in response to the playbook_on_stats # event handling code in main.models.events pass elif hasattr(uj, 'send_notification_templates'): handle_success_and_failure_notifications.apply_async([uj.id]) except Exception: logger.exception('Worker failed to emit notifications: Job {}'.format(job_identifier)) finally: self.subsystem_metrics.inc('callback_receiver_events_in_memory', -1) set_guid('') return skip_websocket_message = body.pop('skip_websocket_message', False) event = cls.create_from_data(**body) if skip_websocket_message: event._skip_websocket_message = True self.buff.setdefault(cls, []).append(event) retries = 0 while retries <= self.MAX_RETRIES: try: self.flush(force=flush) break except (OperationalError, InterfaceError, InternalError): if retries >= self.MAX_RETRIES: logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.') return delay = 60 * retries logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay)) django_connection.close() time.sleep(delay) retries += 1 except DatabaseError: logger.exception('Database Error Saving Job Event') break except Exception as exc: tb = traceback.format_exc() logger.error('Callback Task Processor Raised Exception: %r', exc) logger.error('Detail: {}'.format(tb))
def _queue_reserved_task(func, inner_task_id, resources, inner_args, inner_kwargs, options): """ A task that encapsulates another task to be dispatched later. This task being encapsulated is called the "inner" task, and a task name, UUID, and accepts a list of positional args and keyword args for the inner task. These arguments are named inner_args and inner_kwargs. inner_args is a list, and inner_kwargs is a dictionary passed to the inner task as positional and keyword arguments using the * and ** operators. The inner task is dispatched into a dedicated queue for a worker that is decided at dispatch time. The logic deciding which queue receives a task is controlled through the find_worker function. Args: func (basestring): The function to be called inner_task_id (basestring): The task_id to be set on the task being called. By providing the UUID, the caller can have an asynchronous reference to the inner task that will be dispatched. resources (basestring): The urls of the resource you wish to reserve for your task. The system will ensure that no other tasks that want that same reservation will run concurrently with yours. inner_args (tuple): The positional arguments to pass on to the task. inner_kwargs (dict): The keyword arguments to pass on to the task. options (dict): For all options accepted by enqueue see the RQ docs """ redis_conn = connection.get_redis_connection() task_status = Task.objects.get(pk=inner_task_id) set_guid(task_status.logging_cid) task_name = func.__module__ + "." + func.__name__ while True: if task_name == "pulpcore.app.tasks.orphan.orphan_cleanup": if ReservedResource.objects.exists(): # wait until there are no reservations time.sleep(0.25) continue else: rq_worker = util.get_current_worker() worker = Worker.objects.get(name=rq_worker.name) task_status.worker = worker task_status.set_running() q = Queue("resource-manager", connection=redis_conn, is_async=False) try: q.enqueue( func, args=inner_args, kwargs=inner_kwargs, job_id=inner_task_id, job_timeout=TASK_TIMEOUT, **options, ) task_status.set_completed() except RedisConnectionError as e: task_status.set_failed(e, None) return try: with transaction.atomic(): # lock the worker - there is a similar lock in mark_worker_offline() worker = _acquire_worker(resources) # Attempt to lock all resources by their urls. Must be atomic to prevent deadlocks. for resource in resources: if worker.reservations.select_for_update().filter( resource=resource).exists(): reservation = worker.reservations.get( resource=resource) else: reservation = ReservedResource.objects.create( worker=worker, resource=resource) TaskReservedResource.objects.create(resource=reservation, task=task_status) worker.cleaned_up = False worker.save(update_fields=["cleaned_up"]) except (Worker.DoesNotExist, IntegrityError): # if worker is ready, or we have a worker but we can't create the reservations, wait time.sleep(0.25) else: # we have a worker with the locks break task_status.worker = worker task_status.save() try: q = Queue(worker.name, connection=redis_conn) q.enqueue( func, args=inner_args, kwargs=inner_kwargs, job_id=inner_task_id, job_timeout=TASK_TIMEOUT, **options, ) except RedisConnectionError as e: task_status.set_failed(e, None)