def clean_up_stuck_tasks():
    if not settings.TASK_TIMEOUT:
        return

    # Celery should clean up tasks automatically, so add a buffer to let that happen.
    task_timeout = settings.TASK_TIMEOUT + 120
    client, app_name = get_scale_client()
    time_threshold = datetime.datetime.now(timezone.utc) - datetime.timedelta(seconds=task_timeout)
    export_task_records = (
        ExportTaskRecord.objects.prefetch_related("export_provider_task__tasks")
        .select_related("export_provider_task__run")
        .filter(Q(status=TaskState.RUNNING.value) & Q(started_at__lt=time_threshold))
    )
    run_uids = []
    for export_task_record in export_task_records:
        run = export_task_record.export_provider_task.run
        run_uids.append(str(run.uid))
        # Cancel the export task records that are over the timeout.
        export_task_record.status = TaskState.CANCELED.value
        export_task_record.save()
        # Update the data provider task record to PENDING so that it can get picked up again.
        data_provider_task_record = export_task_record.export_provider_task
        data_provider_task_record.status = TaskState.PENDING.value
        data_provider_task_record.save()
        # Update the run to SUBMITTED so that it can get picked up again.
        run.status = TaskState.SUBMITTED.value
        run.save()
    kill_workers(run_uids, client)
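
# Hypothetical usage sketch (not part of the original module): one way to run
# clean_up_stuck_tasks() periodically is a Celery beat entry. The "app" argument
# is assumed to be the project's Celery app, and the entry name, task path, and
# 60-second interval are illustrative assumptions only.
def example_register_cleanup_schedule(app):
    app.conf.beat_schedule = {
        **(app.conf.beat_schedule or {}),
        "clean-up-stuck-tasks": {
            "task": "eventkit_cloud.tasks.scheduled_tasks.clean_up_stuck_tasks",  # assumed path
            "schedule": 60.0,  # seconds; tune relative to settings.TASK_TIMEOUT
        },
    }
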
def scale_by_tasks(celery_tasks, max_tasks_memory):
    client, app_name = get_scale_client()
    broker_api_url = getattr(settings, "BROKER_API_URL")
    queue_class = "queues"
    celery_pcf_task_details = get_celery_task_details(client, app_name, celery_tasks)
    logger.info(f"Running Tasks Memory used: {celery_pcf_task_details['memory']} MB")
    celery_tasks = order_celery_tasks(celery_tasks, celery_pcf_task_details["task_counts"])
    # We don't want to exceed our memory, but we also don't want to prevent
    # tasks that _can_ run from running.
    smallest_memory_required = int(min([v["memory"] for v in celery_tasks.values()])) or 0
    logger.info(f"smallest_memory_required: {smallest_memory_required}")
    logger.info(f"max_tasks_memory: {max_tasks_memory}")
    running_tasks_memory = celery_pcf_task_details["memory"]
    while running_tasks_memory + smallest_memory_required <= max_tasks_memory:
        queues = get_all_rabbitmq_objects(broker_api_url, queue_class)
        dicts = list_to_dict(queues, "name")
        # If no tasks were run this pass, give up... otherwise try to run another task.
        has_run_task = False
        running_tasks = client.get_running_tasks(app_name)
        if not any(queue.get("messages", 0) for queue in dicts.values()):
            running_task_names = []
            for running_task in running_tasks.get("resources"):
                running_task_name = running_task.get("name")
                running_task_names.append(running_task_name)
                logger.info(f"No messages left in the queue, shutting down {running_task_name}.")
            kill_workers(task_names=running_task_names, client=client)
            break
        queues_to_kill = []
        for celery_task_name, celery_task in celery_tasks.items():
            queue = dicts.get(celery_task_name)
            if not queue:
                continue
            queue_name = queue.get("name")
            pending_messages = queue.get("messages", 0)
            if pending_messages:
                logger.info(f"Queue {queue_name} has {pending_messages} pending messages.")
            # Get updated information...
            running_tasks_by_queue = client.get_running_tasks(app_name, queue_name)
            running_tasks_by_queue_count = running_tasks_by_queue["pagination"].get("total_results", 0)
            if pending_messages > running_tasks_by_queue_count:
                # Allow queues to have a limit, so that we don't spin up 30 priority queues.
                limit = celery_task.get("limit")
                if limit and running_tasks_by_queue_count >= limit:
                    continue
                if running_tasks_memory + celery_tasks[queue_name]["memory"] <= max_tasks_memory:
                    run_task_command(client, app_name, queue_name, celery_tasks[queue_name])
                    # Record that we scaled something so the outer loop tries another pass.
                    has_run_task = True
            elif running_tasks_by_queue_count and not pending_messages:
                logger.info(
                    f"The {queue_name} queue has no messages but {running_tasks_by_queue_count} running "
                    f"tasks. Scheduling shutdown..."
                )
                queues_to_kill.append(queue_name)
            elif running_tasks_by_queue_count:
                logger.info(
                    f"Already {running_tasks_by_queue_count} workers processing {pending_messages} pending "
                    f"messages left in the {queue_name} queue."
                )
            running_tasks_memory = client.get_running_tasks_memory(app_name)
        kill_workers(queues_to_kill, client)
        if not has_run_task:
            break
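
# A minimal sketch (an assumption, not the project's implementation) of the
# list_to_dict() helper used above: the RabbitMQ management API returns queues
# as a list of dicts, and scale_by_tasks() needs to look them up by name.
def list_to_dict_sketch(items, key):
    # e.g. [{"name": "osm", "messages": 3}] -> {"osm": {"name": "osm", "messages": 3}}
    return {item[key]: item for item in items}
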
def scale_by_runs(max_tasks_memory):
    """
    @param max_tasks_memory: The amount of memory in MB to allow for all of the tasks.
    @type max_tasks_memory: int
    """
    from audit_logging.utils import get_user_details

    client, app_name = get_scale_client()
    celery_task_details = get_celery_task_details(client, app_name)
    running_tasks_memory = int(celery_task_details["memory"])
    celery_tasks = get_celery_tasks_scale_by_run()

    # Check if we need to scale for default system tasks.
    scale_default_tasks(client, app_name, celery_tasks)

    # Get the runs in progress.
    runs = ExportRun.objects.filter(status=TaskState.SUBMITTED.value, deleted=False)
    total_tasks = 0
    running_tasks = client.get_running_tasks(app_name)
    logger.info(f"Running tasks: {running_tasks}")
    if running_tasks:
        total_tasks = running_tasks["pagination"].get("total_results", 0)

    # Get a list of running task names, excluding the default celery tasks.
    running_task_names = [
        resource.get("name")
        for resource in running_tasks.get("resources")
        if resource.get("name") != "celery"
    ]
    finished_runs = ExportRun.objects.filter(
        Q(uid__in=running_task_names)
        & (Q(status__in=[state.value for state in TaskState.get_finished_states()]) | Q(deleted=True))
    )
    finished_run_uids = []
    for finished_run in finished_runs:
        logger.info(
            f"Stopping {finished_run.uid} because it is in a finished state ({finished_run.status}) "
            f"or was deleted ({finished_run.deleted})."
        )
        finished_run_uids.append(str(finished_run.uid))
    kill_workers(task_names=finished_run_uids, client=client)

    for run in runs:
        celery_run_task = copy.deepcopy(celery_tasks["run"])
        logger.info(f"Checking to see if submitted run {run.uid} needs a new worker.")
        max_runs = int(os.getenv("RUNS_CONCURRENCY", 3))
        if max_runs and total_tasks >= max_runs:
            logger.info(f"total_tasks ({total_tasks}) >= max_runs ({max_runs})")
            break
        if running_tasks_memory + celery_run_task["memory"] >= max_tasks_memory:
            logger.info("Not enough available memory to scale another run.")
            break
        task_name = run.uid
        running_tasks_by_queue = client.get_running_tasks(app_name, task_name)
        running_tasks_by_queue_count = running_tasks_by_queue["pagination"].get("total_results", 0)
        logger.info(f"Currently {running_tasks_by_queue_count} tasks running for {task_name}.")
        if running_tasks_by_queue_count:
            logger.info(f"Already a consumer for {task_name}.")
            continue
        user_session = UserSession.objects.filter(user=run.user).last()
        session_token = None
        if user_session:
            session = Session.objects.get(session_key=user_session.session_id)
            session_token = session.get_decoded().get("session_token")
        user_details = get_user_details(run.user)
        pick_up_run_task.s(
            run_uid=str(run.uid), session_token=session_token, user_details=user_details
        ).apply_async(queue=str(task_name), routing_key=str(task_name))
        celery_run_task["command"] = celery_run_task["command"].format(celery_group_name=task_name)
        run_task_command(client, app_name, str(task_name), celery_run_task)
        # Keep track of the new resources being used.
        total_tasks += 1
        running_tasks_memory += celery_run_task["memory"]
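
# A hypothetical dispatcher showing how the two strategies above could be
# selected from one entry point. The CELERY_SCALE_BY_RUN setting name and the
# get_celery_tasks() helper are assumptions for illustration, not confirmed
# parts of this module.
def scale_celery_sketch(max_tasks_memory):
    if getattr(settings, "CELERY_SCALE_BY_RUN", True):
        # Run-based scaling: one worker per submitted ExportRun.
        scale_by_runs(max_tasks_memory)
    else:
        # Queue-based scaling: workers sized to pending RabbitMQ messages.
        scale_by_tasks(get_celery_tasks(), max_tasks_memory)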