def _delete_cloud_accounts(cloud_accounts):
    """
    Delete the given list of CloudAccount objects.

    Args:
        cloud_accounts (list[CloudAccount]): cloud accounts to delete
    """
    for account in cloud_accounts:
        # Lock at the user level so a single user can only have one task
        # running at a time. The select_for_update() lock lives on the
        # UserTaskLock (not the CloudAccount), and we release it after each
        # individual account deletion. Using the UserTaskLock *should* avoid
        # Django holding a row-level DB lock on every CloudAccount until all
        # of the pre_delete logic completes.
        with lock_task_for_user_ids([account.user.id]):
            # Delete via the queryset rather than the instance: a queryset
            # delete does not raise DoesNotExist if the account has already
            # been deleted, which would otherwise cause trouble with Django
            # rollback and our task lock.
            # See https://gitlab.com/cloudigrade/cloudigrade/-/merge_requests/811
            try:
                account.refresh_from_db()
                CloudAccount.objects.filter(id=account.id).delete()
            except CloudAccount.DoesNotExist:
                logger.info(
                    _("Cloud Account %s has already been deleted"), account
                )
def test_lock_task_for_user_ids_updates_usertasklock(self):
    """Assert UserTaskLock is updated by context manager."""
    test_user = util_helper.generate_test_user()
    UserTaskLock.objects.create(user=test_user)

    # While the context manager is active, every yielded lock is held.
    with lock_task_for_user_ids([test_user.id]) as held_locks:
        for held in held_locks:
            self.assertTrue(held.locked)

    # After exiting, the user's lock must be released again.
    released = UserTaskLock.objects.get(user=test_user)
    self.assertFalse(released.locked)
def test_lock_task_for_user_ids_create_usertasklock(self):
    """Assert UserTaskLock is created by context manager."""
    first_user = util_helper.generate_test_user()
    second_user = util_helper.generate_test_user()

    # Entering the context manager creates a lock per user and flags each
    # one as locked.
    with lock_task_for_user_ids([first_user.id, second_user.id]):
        for created_lock in UserTaskLock.objects.all():
            self.assertTrue(created_lock.locked)

    # On exit, every lock must be released.
    for released_lock in UserTaskLock.objects.all():
        self.assertFalse(released_lock.locked)
def save(self, *args, **kwargs):
    """Save this image and delete any related ConcurrentUsage objects."""
    related_runs = Run.objects.filter(machineimage=self)
    stale_usages = ConcurrentUsage.objects.filter(
        potentially_related_runs__in=related_runs
    )
    if stale_usages.exists():
        # Lock every user that depends on this machine image before
        # discarding their cached concurrent-usage data.
        affected_user_ids = set(
            Instance.objects.filter(machine_image=self).values_list(
                "cloud_account__user__id", flat=True
            )
        )
        with lock_task_for_user_ids(affected_user_ids):
            stale_usages.delete()
    return super().save(*args, **kwargs)
def test_delete_clount_lock_exception(self, mock_notify_sources):
    """Test that an exception when deleting a clount inside a lock rolls back."""
    dummy_account_id = util_helper.generate_dummy_aws_account_id()
    dummy_arn = util_helper.generate_dummy_arn(account_id=dummy_account_id)
    clount = api_helper.generate_cloud_account(
        arn=dummy_arn,
        aws_account_id=dummy_account_id,
        name="test",
        generate_verify_task=False,
    )
    UserTaskLock.objects.create(user=clount.user)

    # Raising inside the lock must roll back the delete...
    with self.assertRaises(transaction.TransactionManagementError):
        with lock_task_for_user_ids([clount.user.id]):
            CloudAccount.objects.filter(id=clount.id).delete()
            raise transaction.TransactionManagementError

    # ...so the account still exists and the user's lock is released.
    self.assertEqual(CloudAccount.objects.count(), 1)
    self.assertFalse(UserTaskLock.objects.get(user=clount.user).locked)
def test_delete_clount_lock(self, mock_notify_sources, mock_delete_cloudtrail):
    """Test that deleting an clount inside a lock is successful."""
    dummy_account_id = util_helper.generate_dummy_aws_account_id()
    dummy_arn = util_helper.generate_dummy_arn(account_id=dummy_account_id)
    clount = api_helper.generate_cloud_account(
        arn=dummy_arn,
        aws_account_id=dummy_account_id,
        name="test",
        generate_verify_task=False,
    )

    with lock_task_for_user_ids([clount.user.id]):
        CloudAccount.objects.filter(id=clount.id).delete()

    # The account is gone, and no lock remains for its user.
    self.assertEqual(CloudAccount.objects.count(), 0)
    self.assertFalse(UserTaskLock.objects.filter(user=clount.user).exists())
def save(self, *args, **kwargs):
    """
    Save this image and delete any related ConcurrentUsage objects.

    ConcurrentUsage rows cache calculated results that may become stale when
    this image changes, so any usage tied to this image's runs is deleted
    (under a per-user task lock) before saving.
    """
    concurrent_usages = ConcurrentUsage.objects.filter(
        potentially_related_runs__in=Run.objects.filter(machineimage=self)
    )
    if concurrent_usages.exists():
        # Lock all users that depend on this machineimage.
        user_ids = set(
            Instance.objects.filter(machine_image=self).values_list(
                "cloud_account__user__id", flat=True
            )
        )
        with lock_task_for_user_ids(user_ids):
            # Bug fix: the old message said "related to Run %(run)s", but the
            # value logged is this MachineImage (str(self)), not a Run.
            logger.info(
                "Removing %(num_usages)d ConcurrentUsage objects related "
                "to MachineImage %(machine_image)s.",
                {
                    "num_usages": concurrent_usages.count(),
                    "machine_image": str(self),
                },
            )
            concurrent_usages.delete()
    return super().save(*args, **kwargs)
def _save_cloudtrail_activity(
    instance_events, ami_tag_events, described_instances, described_images
):
    """
    Save new images and instances events found via CloudTrail to the DB.

    The order of operations here generally looks like:

        1. Save new images.
        2. Save tag changes for images.
        3. Save new instances.
        4. Save events for instances.

    Note:
        Nothing should be reaching out to AWS APIs in this function! We should
        have all the necessary information already, and this function saves
        all of it atomically in a single transaction.

    Args:
        instance_events (list[CloudTrailInstanceEvent]): found instance events
        ami_tag_events (list[CloudTrailImageTagEvent]): found ami tag events
        described_instances (dict): described new-to-us AWS instances keyed by
            EC2 instance ID
        described_images (dict): described new-to-us AMIs keyed by AMI ID

    Returns:
        dict: Only the new images that were created in the process.
    """
    log_prefix = "analyzer"

    # Lock all user accounts related to the events being processed.
    # A user can only run one task at a time.
    all_user_ids = set(
        [
            AwsCloudAccount.objects.get(
                aws_account_id=instance_event.aws_account_id
            ).cloud_account.get().user.id
            for instance_event in instance_events
        ]
        + [
            AwsCloudAccount.objects.get(
                aws_account_id=ami_tag_event.aws_account_id
            ).cloud_account.get().user.id
            for ami_tag_event in ami_tag_events
        ]
    )
    with lock_task_for_user_ids(all_user_ids):
        # Log some basic information about what we're saving.
        all_ec2_instance_ids = set(
            [
                instance_event.ec2_instance_id
                for instance_event in instance_events
                if instance_event.ec2_instance_id is not None
            ]
        )
        logger.info(
            _("%(prefix)s: EC2 Instance IDs found: %(all_ec2_instance_ids)s"),
            {
                "prefix": log_prefix,
                "all_ec2_instance_ids": all_ec2_instance_ids,
            },
        )
        all_ami_ids = set(
            [
                instance_event.ec2_ami_id
                for instance_event in instance_events
                if instance_event.ec2_ami_id is not None
            ]
            + [
                ami_tag_event.ec2_ami_id
                for ami_tag_event in ami_tag_events
                if ami_tag_event.ec2_ami_id is not None
            ]
            + [ec2_ami_id for ec2_ami_id in described_images.keys()]
        )
        logger.info(
            _("%(prefix)s: EC2 AMI IDs found: %(all_ami_ids)s"),
            {"prefix": log_prefix, "all_ami_ids": all_ami_ids},
        )

        # Which images have the Windows platform?
        windows_ami_ids = {
            ami_id
            for ami_id, described_ami in described_images.items()
            if is_windows(described_ami)
        }
        logger.info(
            _("%(prefix)s: Windows AMI IDs found: %(windows_ami_ids)s"),
            {"prefix": log_prefix, "windows_ami_ids": windows_ami_ids},
        )

        # Which images need tag state changes?
        ocp_tagged_ami_ids, ocp_untagged_ami_ids = _extract_ami_ids_by_tag_change(
            ami_tag_events, OPENSHIFT_TAG
        )
        logger.info(
            _("%(prefix)s: AMIs found tagged for OCP: %(ocp_tagged_ami_ids)s"),
            {"prefix": log_prefix, "ocp_tagged_ami_ids": ocp_tagged_ami_ids},
        )
        logger.info(
            _("%(prefix)s: AMIs found untagged for OCP: %(ocp_untagged_ami_ids)s"),
            {
                "prefix": log_prefix,
                "ocp_untagged_ami_ids": ocp_untagged_ami_ids,
            },
        )
        rhel_tagged_ami_ids, rhel_untagged_ami_ids = _extract_ami_ids_by_tag_change(
            ami_tag_events, RHEL_TAG
        )
        logger.info(
            _("%(prefix)s: AMIs found tagged for RHEL: %(rhel_tagged_ami_ids)s"),
            {
                "prefix": log_prefix,
                "rhel_tagged_ami_ids": rhel_tagged_ami_ids,
            },
        )
        logger.info(
            _("%(prefix)s: AMIs found untagged for RHEL: %(rhel_untagged_ami_ids)s"),
            {
                "prefix": log_prefix,
                "rhel_untagged_ami_ids": rhel_untagged_ami_ids,
            },
        )

        # Create only the new images.
        new_images = {}
        for ami_id, described_image in described_images.items():
            owner_id = Decimal(described_image["OwnerId"])
            name = described_image["Name"]
            architecture = described_image.get("Architecture")
            windows = ami_id in windows_ami_ids
            rhel_detected_by_tag = ami_id in rhel_tagged_ami_ids
            openshift_detected = ami_id in ocp_tagged_ami_ids
            region = described_image["found_in_region"]

            logger.info(
                _("%(prefix)s: Saving new AMI ID %(ami_id)s in region %(region)s"),
                {"prefix": log_prefix, "ami_id": ami_id, "region": region},
            )
            awsimage, new = save_new_aws_machine_image(
                ami_id,
                name,
                owner_id,
                rhel_detected_by_tag,
                openshift_detected,
                windows,
                region,
                architecture,
            )
            image = awsimage.machine_image.get()
            # Bug fix: the original used "is not" here, an identity check that
            # is only accidentally correct when CPython happens to intern both
            # strings. Use value inequality instead.
            if new and image.status != image.INSPECTED:
                new_images[ami_id] = awsimage

        # Create "unavailable" images for AMIs we saw referenced but that we
        # either don't have in our models or could not describe from AWS.
        seen_ami_ids = set(
            [
                described_instance["ImageId"]
                for described_instance in described_instances.values()
                if described_instance.get("ImageId") is not None
            ]
            + [
                ami_tag_event.ec2_ami_id
                for ami_tag_event in ami_tag_events
                if ami_tag_event.ec2_ami_id is not None
            ]
            + [
                instance_event.ec2_ami_id
                for instance_event in instance_events
                if instance_event.ec2_ami_id is not None
            ]
        )
        described_ami_ids = set(described_images.keys())
        known_ami_ids = set(
            image.ec2_ami_id
            for image in AwsMachineImage.objects.filter(
                ec2_ami_id__in=list(seen_ami_ids - described_ami_ids)
            )
        )
        unavailable_ami_ids = seen_ami_ids - described_ami_ids - known_ami_ids
        for ami_id in unavailable_ami_ids:
            logger.info(
                _("Missing image data for %s; creating UNAVAILABLE stub image."),
                ami_id,
            )
            with transaction.atomic():
                awsmachineimage = AwsMachineImage.objects.create(
                    ec2_ami_id=ami_id
                )
                MachineImage.objects.create(
                    status=MachineImage.UNAVAILABLE,
                    content_object=awsmachineimage,
                )
                awsmachineimage.machine_image.get()

        # Update images with openshift tag changes.
        if ocp_tagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=ocp_tagged_ami_ids
            ).update(openshift_detected=True)
        if ocp_untagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=ocp_untagged_ami_ids
            ).update(openshift_detected=False)

        # Update images with RHEL tag changes.
        if rhel_tagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=rhel_tagged_ami_ids
            ).update(rhel_detected_by_tag=True)
        if rhel_untagged_ami_ids:
            MachineImage.objects.filter(
                aws_machine_image__ec2_ami_id__in=rhel_untagged_ami_ids
            ).update(rhel_detected_by_tag=False)

        # Save instances and their events.
        # NOTE(review): itertools.groupby only groups consecutive items;
        # this assumes instance_events arrives sorted by the same key —
        # confirm at the caller.
        for (
            (ec2_instance_id, region, aws_account_id),
            events,
        ) in itertools.groupby(
            instance_events,
            key=lambda e: (e.ec2_instance_id, e.region, e.aws_account_id),
        ):
            events = list(events)
            if ec2_instance_id in described_instances:
                instance_data = described_instances[ec2_instance_id]
            else:
                # We never described this instance; build a minimal stand-in
                # from the first event's data.
                instance_data = {
                    "InstanceId": ec2_instance_id,
                    "ImageId": events[0].ec2_ami_id,
                    "SubnetId": events[0].subnet_id,
                }
            logger.info(
                _(
                    "%(prefix)s: Saving new EC2 instance ID %(ec2_instance_id)s "
                    "for AWS account ID %(aws_account_id)s in region %(region)s"
                ),
                {
                    "prefix": log_prefix,
                    "ec2_instance_id": ec2_instance_id,
                    "aws_account_id": aws_account_id,
                    "region": region,
                },
            )
            awsaccount = AwsCloudAccount.objects.get(
                aws_account_id=aws_account_id
            )
            account = awsaccount.cloud_account.get()
            instance = save_instance(account, instance_data, region)

            # Build a list of event data and save the events.
            events_info = _build_events_info_for_saving(
                account, instance, events
            )
            save_instance_events(instance, instance_data, events_info)

        return new_images
def initial_aws_describe_instances(account_id):
    """
    Fetch and save instances data found upon AWS cloud account creation.

    Args:
        account_id (int): the AwsAccount id
    """
    try:
        aws_account = AwsCloudAccount.objects.get(pk=account_id)
    except AwsCloudAccount.DoesNotExist:
        logger.warning(
            _("AwsCloudAccount id %s could not be found for initial describe"),
            account_id,
        )
        # This can happen if a customer creates and then quickly deletes their
        # cloud account before this async task has started to run. Early exit!
        return

    account = aws_account.cloud_account.get()
    if not account.is_enabled:
        logger.warning(
            _("AwsCloudAccount id %s is not enabled; skipping initial describe"),
            account_id,
        )
        # This can happen if a customer creates and then quickly disables their
        # cloud account before this async task has started to run. Early exit!
        return
    arn = aws_account.account_arn

    # Reach out to AWS *before* taking the task lock; the describe call may be
    # slow, and we don't want to hold the lock during it.
    session = aws.get_session(arn)
    instances_data = aws.describe_instances_everywhere(session)

    try:
        user_id = account.user.id
    except User.DoesNotExist:
        logger.info(
            _(
                "User for account id %s has already been deleted; "
                "skipping initial describe."
            ),
            account_id,
        )
        # This can happen if a customer creates and then quickly deletes their
        # cloud account before this async task has started to run. If the user
        # has no other cloud accounts the user will also be deleted. Early exit!
        return

    # Lock the task at a user level. A user can only run one task at a time.
    with lock_task_for_user_ids([user_id]):
        try:
            # Explicitly "get" the related AwsCloudAccount before proceeding.
            # We do this at the start of this transaction in case the account
            # has been deleted during the potentially slow
            # describe_instances_everywhere above. If this fails, we jump to
            # the except block to log an important warning.
            AwsCloudAccount.objects.get(pk=account_id)
            create_missing_power_off_aws_instance_events(
                account, instances_data
            )
            new_ami_ids = create_new_machine_images(session, instances_data)
            logger.info(
                _("Created new machine images include: %(new_ami_ids)s"),
                {"new_ami_ids": new_ami_ids},
            )
            create_initial_aws_instance_events(account, instances_data)
        except AwsCloudAccount.DoesNotExist:
            logger.warning(
                _(
                    "AwsCloudAccount id %s could not be found to save newly "
                    "discovered images and instances"
                ),
                account_id,
            )
            # This can happen if a customer deleted their cloud account between
            # the start of this function and here. The AWS calls for
            # describe_instances_everywhere may be slow and are not within this
            # transaction. That's why we have to check again after it.
            return

    # Kick off image inspection outside the lock; these are async starts.
    messages = generate_aws_ami_messages(instances_data, new_ami_ids)
    for message in messages:
        start_image_inspection(str(arn), message["image_id"], message["region"])
def calculate_max_concurrent_usage_task(self, date, user_id):  # noqa: C901
    """
    Schedule a task to calculate maximum concurrent usage of RHEL instances.

    Args:
        self (celery.Task): The bound task. With this we can retry if necessary.
        date (str): the day during which we are measuring usage. Celery
            serializes the date as a string in the format "%Y-%B-%dT%H:%M:%S.
        user_id (int): required filter on user

    Returns:
        ConcurrentUsage for the given date and user ID.
    """
    task_id = self.request.id
    date = date_parser.parse(date).date()
    # Temporary logger.info to help diagnose retry issues.
    logger.info(
        "retries is %(retries)s for id %(id)s user_id %(user_id)s and date %(date)s.",
        {
            "retries": self.request.retries,
            "id": task_id,
            "user_id": user_id,
            "date": date,
        },
    )
    # If the user does not exist, all the related ConcurrentUsage
    # objects should also have been removed, so we can exit early.
    if not User.objects.filter(id=user_id).exists():
        return
    try:
        # Lock the task at a user level. A user can only run one task at a
        # time. Since this both starts a transaction and blocks any others
        # from starting, we can be reasonably confident that there are no
        # other tasks processing for the same user and date at the same time.
        with lock_task_for_user_ids([user_id]):
            try:
                calculation_task = ConcurrentUsageCalculationTask.objects.get(
                    task_id=task_id
                )
            except ConcurrentUsageCalculationTask.DoesNotExist:
                # It's possible but unlikely this task was deleted since its
                # task was delayed. Since the same user still exists, try
                # scheduling a new task.
                logger.warning(
                    "ConcurrentUsageCalculationTask not found for task ID %(task_id)s! "
                    "Scheduling a new task for user_id %(user_id)s and date %(date)s.",
                    {"task_id": task_id, "user_id": user_id, "date": date},
                )
                schedule_concurrent_calculation_task(date, user_id)
                return

            if calculation_task.status != ConcurrentUsageCalculationTask.SCHEDULED:
                # It's possible but unlikely that something else has changed
                # the status of this task. If it's not currently SCHEDULED,
                # log and return early.
                logger.info(
                    "ConcurrentUsageCalculationTask for task ID %(task_id)s for "
                    "user_id %(user_id)s and date %(date)s has status "
                    "%(status)s which is not SCHEDULED.",
                    {
                        "user_id": user_id,
                        "date": date,
                        "task_id": task_id,
                        "status": calculation_task.status,
                    },
                )
                return

            calculate_max_concurrent_usage(date, user_id)

            calculation_task.status = ConcurrentUsageCalculationTask.COMPLETE
            calculation_task.save()
            logger.info(
                "Completed calculate_max_concurrent_usage_task for user_id %(user_id)s "
                "and date %(date)s (task_id %(task_id)s).",
                {"user_id": user_id, "date": date, "task_id": task_id},
            )
            return
    except Exception as unknown_exception:
        # It's unclear exactly what other exceptions might arise, but just to
        # be safe, let's log the trace, set the task's status to ERROR, and
        # re-raise it.
        # NOTE(review): a bare "raise" would preserve the traceback equally
        # well here; "raise unknown_exception" is kept as-is.
        logger.warning(unknown_exception, exc_info=True)
        # Use this objects.filter().update() pattern so that we don't risk
        # raising an IntegrityError in case the object has somehow been
        # deleted.
        ConcurrentUsageCalculationTask.objects.filter(task_id=task_id).update(
            status=ConcurrentUsageCalculationTask.ERROR
        )
        raise unknown_exception
def calculate_max_concurrent_usage_task(self, date, user_id):
    """
    Schedule a task to calculate maximum concurrent usage of RHEL instances.

    Args:
        self (celery.Task): The bound task. With this we can retry if necessary.
        date (str): the day during which we are measuring usage. Celery
            serializes the date as a string in the format "%Y-%B-%dT%H:%M:%S.
        user_id (int): required filter on user

    Returns:
        ConcurrentUsage for the given date and user ID.
    """
    # Temporary logger.info to help diagnose retry issues.
    logger.info(
        "retries is %(retries)s for id %(id)s user_id %(user_id)s and date %(date)s.",
        {
            "retries": self.request.retries,
            "id": self.request.id,
            "user_id": user_id,
            "date": date,
        },
    )
    # If the user does not exist, all the related ConcurrentUsage
    # objects should also have been removed, so we can exit early.
    if not User.objects.filter(id=user_id).exists():
        return
    date = date_parser.parse(date).date()

    # If there is already an calculate_max_concurrent_usage running for given
    # user and date, then retry this task later.
    running_tasks = ConcurrentUsageCalculationTask.objects.filter(
        date=date,
        user__id=user_id,
        status=ConcurrentUsageCalculationTask.RUNNING,
    )
    if running_tasks:
        logger.info(
            "calculate_max_concurrent_usage_task for user_id %(user_id)s "
            "and date %(date)s is already running. The current task will "
            "be retried later.",
            {"user_id": user_id, "date": date},
        )
        for task in running_tasks:
            logger.info("already running task %(task)s", {"task": task})
        # self.retry() raises celery's Retry, so execution stops here when a
        # conflicting task is running.
        self.retry()

    logger.info(
        "Running calculate_max_concurrent_usage_task for user_id %(user_id)s "
        "and date %(date)s.",
        {"user_id": user_id, "date": date},
    )

    # Set task to running.
    task_id = self.request.id
    try:
        calculation_task = ConcurrentUsageCalculationTask.objects.get(
            task_id=task_id
        )
    except ConcurrentUsageCalculationTask.DoesNotExist:
        # This probably shouldn't happen, but this error that suggest it does:
        # https://sentry.io/organizations/cloudigrade/issues/2299804963/
        # Until we can figure out the root cause of tasks going missing, let's
        # log an error here with details and schedule a new calculation task.
        logger.error(
            'ConcurrentUsageCalculationTask not found for task ID "%(task_id)s"! '
            "Scheduling a new task for user_id %(user_id)s and date %(date)s.",
            {"task_id": task_id, "user_id": user_id, "date": date},
        )
        schedule_concurrent_calculation_task(date, user_id)
        return

    calculation_task.status = ConcurrentUsageCalculationTask.RUNNING
    calculation_task.save()
    try:
        # Lock the task at a user level. A user can only run one task at a
        # time. If another user task is already running, then don't start the
        # concurrent usage calculation task.
        with lock_task_for_user_ids([user_id]):
            calculate_max_concurrent_usage(date, user_id)
    except Exception:
        # Mark the task failed before propagating the error to Celery.
        calculation_task.status = ConcurrentUsageCalculationTask.ERROR
        calculation_task.save()
        raise

    calculation_task.status = ConcurrentUsageCalculationTask.COMPLETE
    calculation_task.save()
    logger.info(
        "Completed calculate_max_concurrent_usage_task for user_id %(user_id)s "
        "and date %(date)s.",
        {"user_id": user_id, "date": date},
    )