def _lookup_repo_storages_by_content_checksum(repo, checksums, model_class):
    assert checksums

    # There may be many duplicates of the checksums, so for performance reasons we are going
    # to use a union to select just one storage with each checksum.
    queries = []
    for counter, checksum in enumerate(checksums):
        query_alias = "q{0}".format(counter)
        candidate_subq = (
            ImageStorage.select(
                ImageStorage.id,
                ImageStorage.content_checksum,
                ImageStorage.image_size,
                ImageStorage.uuid,
                ImageStorage.cas_path,
                ImageStorage.uncompressed_size,
                ImageStorage.uploading,
            )
            .join(model_class)
            .where(model_class.repository == repo, ImageStorage.content_checksum == checksum)
            .limit(1)
            .alias(query_alias)
        )
        queries.append(ImageStorage.select(SQL("*")).from_(candidate_subq))

    assert queries
    return _basequery.reduce_as_tree(queries)
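# A minimal, runnable sketch of the balanced reduction assumed to be performed by
# _basequery.reduce_as_tree: queries are combined pairwise so the resulting UNION tree has
# logarithmic depth rather than a deeply nested left-leaning chain, which keeps large unions
# within database nesting limits. The `union` callable is a hypothetical stand-in for
# combining two peewee SELECT queries.
def _reduce_as_tree_sketch(items, union=lambda a, b: "({0} UNION {1})".format(a, b)):
    assert items

    # Combine adjacent pairs until a single combined item remains.
    while len(items) > 1:
        paired = [union(items[i], items[i + 1]) for i in range(0, len(items) - 1, 2)]
        if len(items) % 2 == 1:
            paired.append(items[-1])
        items = paired

    return items[0]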
def _orphaned_storage_query(candidate_ids):
    """
    Returns the subset of the candidate ImageStorage IDs representing storages that are no
    longer referenced by images.
    """
    # Issue a union query to find all storages that are still referenced by a candidate
    # storage. This is much faster than the group_by and having call we used to use here.
    nonorphaned_queries = []
    for counter, candidate_id in enumerate(candidate_ids):
        query_alias = "q{0}".format(counter)

        # TODO: remove the join with Image once fully on the OCI data model.
        storage_subq = (
            ImageStorage.select(ImageStorage.id)
            .join(Image)
            .where(ImageStorage.id == candidate_id)
            .limit(1)
            .alias(query_alias)
        )
        nonorphaned_queries.append(ImageStorage.select(SQL("*")).from_(storage_subq))

        manifest_storage_subq = (
            ImageStorage.select(ImageStorage.id)
            .join(ManifestBlob)
            .where(ImageStorage.id == candidate_id)
            .limit(1)
            .alias(query_alias)
        )
        nonorphaned_queries.append(ImageStorage.select(SQL("*")).from_(manifest_storage_subq))

    # Build the set of storages that are missing from the results. These storages are orphaned.
    nonorphaned_storage_ids = {
        storage.id for storage in _basequery.reduce_as_tree(nonorphaned_queries)
    }
    return list(candidate_ids - nonorphaned_storage_ids)
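# A minimal, in-memory sketch of the orphan check above, using plain sets of IDs
# (hypothetical inputs, not the real models): a candidate storage is orphaned when neither
# an Image row nor a ManifestBlob row references it.
def _orphaned_storage_sketch(candidate_ids, image_storage_ids, manifest_blob_storage_ids):
    # IDs referenced by either table are still live; everything else is orphaned.
    referenced = candidate_ids & (image_storage_ids | manifest_blob_storage_ids)
    return list(candidate_ids - referenced)


# Example: _orphaned_storage_sketch({1, 2, 3}, {1}, {2}) returns [3].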
def lookup_repo_storages_by_content_checksum(repo, checksums, by_manifest=False):
    """
    Looks up repository storages (without placements) matching the given repository and checksum.
    """
    if not checksums:
        return []

    # TODO: Remove the Image-based lookup once we have a new-style model for tracking temp
    # uploaded blobs and all legacy tables have been removed.
    model_class = ManifestBlob if by_manifest else Image

    # There may be many duplicates of the checksums, so deduplicate them before handing off to
    # the helper above, which unions one subquery per checksum.
    return _lookup_repo_storages_by_content_checksum(repo, set(checksums), model_class)
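# A short sketch (hypothetical caller, not from this module) of how the lookup's result might
# be diffed against the requested digests to find blobs missing from the repository;
# `storages` stands in for the rows returned above.
def _find_missing_checksums_sketch(requested_checksums, storages):
    # Each returned storage carries its content_checksum, so a set difference yields the
    # digests with no matching storage in the repository.
    found = {storage.content_checksum for storage in storages}
    return set(requested_checksums) - found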
def get_matching_tags_for_images(image_pairs, filter_images=None, filter_tags=None, selections=None):
    """
    Returns all tags that contain the images with the given docker_image_id and storage_uuid,
    specified as an iterable of (docker_image_id, storage_uuid) pairs.
    """
    # Materialize the iterable so it can be tested for emptiness and sliced below.
    image_pairs = list(image_pairs)
    if not image_pairs:
        return []

    image_pairs_set = set(image_pairs)

    # Find all possible matching image+storages.
    images = []

    while image_pairs:
        image_pairs_slice = image_pairs[:_MAX_IMAGE_LOOKUP_COUNT]

        ids = [pair[0] for pair in image_pairs_slice]
        uuids = [pair[1] for pair in image_pairs_slice]

        images_query = (
            Image.select(Image.id, Image.docker_image_id, Image.ancestors, ImageStorage.uuid)
            .join(ImageStorage)
            .where(Image.docker_image_id << ids, ImageStorage.uuid << uuids)
            .switch(Image)
        )

        if filter_images is not None:
            images_query = filter_images(images_query)

        images.extend(list(images_query))
        image_pairs = image_pairs[_MAX_IMAGE_LOOKUP_COUNT:]

    # Filter down to those images actually in the pairs set and build the set of queries to run.
    individual_image_queries = []

    for img in images:
        # Make sure the image found is in the set of those requested, and that we haven't
        # already processed it. We need this check because the query above matches the IDs and
        # storage UUIDs independently, rather than as the expected ID+UUID pairs. We do this
        # for efficiency reasons, and it is highly unlikely we'll find an image with a
        # mismatch, but we need this check to be absolutely sure.
        pair = (img.docker_image_id, img.storage.uuid)
        if pair not in image_pairs_set:
            continue

        # Remove the pair so we don't try it again.
        image_pairs_set.remove(pair)

        ancestors_str = "%s%s/%%" % (img.ancestors, img.id)
        query = Image.select(Image.id).where(
            (Image.id == img.id) | (Image.ancestors ** ancestors_str)
        )
        individual_image_queries.append(query)

    if not individual_image_queries:
        return []

    # Shard based on the max subquery count. This is used to prevent going over the DB's max
    # query size, as well as to prevent the DB from locking up on a massive query.
    sharded_queries = []
    while individual_image_queries:
        shard = individual_image_queries[:_MAX_SUB_QUERIES]
        sharded_queries.append(_basequery.reduce_as_tree(shard))
        individual_image_queries = individual_image_queries[_MAX_SUB_QUERIES:]

    # Collect IDs of the tags found for each query.
    tags = {}
    for query in sharded_queries:
        ImageAlias = Image.alias()
        tag_query = _tag_alive(
            RepositoryTag.select(*(selections or []))
            .distinct()
            .join(ImageAlias)
            .where(RepositoryTag.hidden == False)
            .where(ImageAlias.id << query)
            .switch(RepositoryTag)
        )

        if filter_tags is not None:
            tag_query = filter_tags(tag_query)

        for tag in tag_query:
            tags[tag.id] = tag

    return tags.values()
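# A minimal, runnable sketch of the slicing pattern used twice above: work items are consumed
# in fixed-size slices so that no single database query exceeds the lookup or subquery limits
# (_MAX_IMAGE_LOOKUP_COUNT and _MAX_SUB_QUERIES). In the real code each shard of subqueries is
# then collapsed with _basequery.reduce_as_tree.
def _shard_sketch(items, max_per_shard):
    shards = []
    while items:
        shards.append(items[:max_per_shard])
        items = items[max_per_shard:]
    return shards


# Example: _shard_sketch(list(range(7)), 3) returns [[0, 1, 2], [3, 4, 5], [6]].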