Beispiel #1
0
def find_or_create_derived_storage(source_image,
                                   transformation_name,
                                   preferred_location,
                                   varying_metadata=None):
    """
    Return the derived storage for an image/transformation pair, creating it if missing.

    An existing row is looked up first via find_derived_storage_for_image. When none is
    found, a new v1 storage is created at `preferred_location` and linked to `source_image`
    through a DerivedStorageForImage row keyed by the transformation and the uniqueness hash
    of `varying_metadata`.

    If inserting the link raises IntegrityError, the freshly created storage and its
    placements are deleted and the lookup is repeated, returning whatever it finds.
    """
    found = find_derived_storage_for_image(source_image, transformation_name,
                                           varying_metadata)
    if found is not None:
        return found

    metadata_hash = _get_uniqueness_hash(varying_metadata)
    transformation = ImageStorageTransformation.get(name=transformation_name)
    candidate = storage.create_v1_storage(preferred_location)

    try:
        return DerivedStorageForImage.create(
            source_image=source_image,
            derivative=candidate,
            transformation=transformation,
            uniqueness_hash=metadata_hash,
        )
    except IntegrityError:
        # The derived storage was inserted by someone else while this function was running.
        # Throw away the storage created above (placements first, then the row itself).
        placement_cleanup = ImageStoragePlacement.delete().where(
            ImageStoragePlacement.storage == candidate)
        placement_cleanup.execute()
        candidate.delete_instance()

    return find_derived_storage_for_image(source_image, transformation_name,
                                          varying_metadata)
Beispiel #2
0
def get_or_create_shared_blob(digest, byte_data, storage):
    """
    Look up the ImageStorage blob for `digest`, creating and uploading it when absent.

    On a miss, a new ImageStorage row is inserted, `byte_data` is written to the first of the
    storage engine's preferred locations, and a placement row for that location is recorded.
    If the write or the placement insert fails, the failure is logged, the new row is deleted
    and the exception is re-raised.

    This method is *only* to be used for shared blobs that are globally accessible, such as
    the special empty gzipped tar layer that Docker no longer pushes to us.
    """
    assert digest
    assert byte_data is not None
    assert isinstance(byte_data, bytes)
    assert storage

    try:
        return ImageStorage.get(content_checksum=digest)
    except ImageStorage.DoesNotExist:
        target_location = storage.preferred_locations[0]
        location_row = ImageStorageLocation.get(name=target_location)
        blob_row = ImageStorage.create(content_checksum=digest,
                                       image_size=len(byte_data))

        try:
            layer_path = storage_model.get_layer_path(blob_row)
            storage.put_content([target_location], layer_path, byte_data)
            ImageStoragePlacement.create(location=location_row, storage=blob_row)
        except BaseException:
            # Deliberately catches everything: the row must not outlive a failed upload, and
            # the original exception is always re-raised afterwards.
            logger.exception("Exception when trying to write special layer %s", digest)
            blob_row.delete_instance()
            raise

        return blob_row
Beispiel #3
0
def add_storage_placement(storage, location_name):
    """
    Record that `storage` has a copy at the location named `location_name`.

    The location row is resolved with get_image_location_for_name. An IntegrityError raised by
    the insert is treated as "placement already exists" and ignored.
    """
    location_row = get_image_location_for_name(location_name)
    try:
        ImageStoragePlacement.create(storage=storage, location=location_row.id)
    except IntegrityError:
        # The placement is already recorded, so there is nothing left to do.
        return
Beispiel #4
0
def find_broken_storages():
    """
    Scan every storage placement and report storages whose layer file is missing.

    Returns:
        A list of ImageStorage ids, each appearing once, for which storage_system.exists()
        returned a falsy value at one or more of their placement locations.
    """
    print("Checking storages...")
    total = ImageStoragePlacement.select().count()
    all_placements = (
        ImageStoragePlacement.select()
        .join(ImageStorage)
        .switch(ImageStoragePlacement)
        .join(ImageStorageLocation)
    )

    missing_ids = set()
    for placement in tqdm(all_placements, total=total):
        layer_path = model.storage.get_layer_path(placement.storage)
        if storage_system.exists([placement.location.name], layer_path):
            continue
        missing_ids.add(placement.storage.id)

    return list(missing_ids)
def backfill_replication():
    """
    Queue replication for image storages that are missing from a required location.

    Walks all images (joined with their storage, repository and namespace user). For each
    image whose storage has not already been reported, the required locations are the
    namespace's region locations plus storage.default_locations. If any of them has no
    placement row, the storage is reported, remembered, and enqueued for replication unless
    the replication queue already reports an alive entry for it.
    """
    reported_uuids = set()
    images = (
        Image.select(Image, ImageStorage, Repository, User)
        .join(ImageStorage)
        .switch(Image)
        .join(Repository)
        .join(User)
    )

    for image in images:
        if image.storage.uuid in reported_uuids:
            continue

        namespace = image.repository.namespace_user
        region_locations = model.user.get_region_locations(namespace)
        required = region_locations | set(storage.default_locations)

        # Named separately from the outer query so the loop source is never rebound.
        placements = (
            ImageStoragePlacement.select(ImageStoragePlacement, ImageStorageLocation)
            .where(ImageStoragePlacement.storage == image.storage)
            .join(ImageStorageLocation)
        )
        present = {placement.location.name for placement in placements}

        if not (required - present):
            continue

        print("Enqueueing image storage %s to be replicated" % (image.storage.uuid))
        reported_uuids.add(image.storage.uuid)

        if image_replication_queue.alive([image.storage.uuid]):
            continue
        queue_storage_replication(namespace.username, image.storage)
Beispiel #6
0
    def test_store_blob_on_first_time_download(self, proxy_manifest_response):
        """
        A first-time blob download through the proxy must leave the blob in storage.

        The Proxy class is patched to return a mock built from
        HELLO_WORLD_SCHEMA2_MANIFEST_JSON and the blob endpoint's model cache is patched with a
        NoopDataModelCache. After a 200 response from v2.download_blob, the layer path of
        self.blob must exist at the location named by its placement row.
        """
        upstream = proxy_manifest_response(
            self.tag,
            HELLO_WORLD_SCHEMA2_MANIFEST_JSON,
            DOCKER_SCHEMA2_MANIFEST_CONTENT_TYPE,
        )
        request_params = {"repository": self.repository, "digest": self.digest}

        with patch(
            "data.registry_model.registry_proxy_model.Proxy",
            MagicMock(return_value=upstream),
        ), patch(
            "endpoints.v2.blob.model_cache",
            NoopDataModelCache(TEST_CACHE_CONFIG),
        ):
            conduct_call(
                self.client,
                "v2.download_blob",
                url_for,
                "GET",
                request_params,
                expected_code=200,
                headers=self.headers,
            )

        layer_path = get_layer_path(self.blob)
        assert layer_path is not None

        blob_placements = ImageStoragePlacement.filter(
            ImageStoragePlacement.storage == self.blob)
        placed_at = [blob_placements.get().location.name]
        found = storage.exists(placed_at, layer_path)
        assert found, f"blob not found in storage at path {layer_path}"
Beispiel #7
0
    def test_create_blob_placement_on_first_time_download(
            self, proxy_manifest_response):
        """
        A first-time blob download through the proxy must create exactly one placement.

        The Proxy class is patched to return a mock built from
        HELLO_WORLD_SCHEMA2_MANIFEST_JSON and the blob endpoint's model cache is patched with a
        NoopDataModelCache. After a 200 response from v2.download_blob, self.blob must have
        exactly one ImageStoragePlacement row.
        """
        upstream = proxy_manifest_response(
            self.tag,
            HELLO_WORLD_SCHEMA2_MANIFEST_JSON,
            DOCKER_SCHEMA2_MANIFEST_CONTENT_TYPE,
        )
        request_params = {"repository": self.repository, "digest": self.digest}

        with patch(
            "data.registry_model.registry_proxy_model.Proxy",
            MagicMock(return_value=upstream),
        ), patch(
            "endpoints.v2.blob.model_cache",
            NoopDataModelCache(TEST_CACHE_CONFIG),
        ):
            conduct_call(
                self.client,
                "v2.download_blob",
                url_for,
                "GET",
                request_params,
                expected_code=200,
                headers=self.headers,
            )

        blob_placements = ImageStoragePlacement.filter(
            ImageStoragePlacement.storage == self.blob)
        assert blob_placements.count() == 1
Beispiel #8
0
def get_storage_locations(uuid):
    """
    Return the names of the locations at which the storage with the given uuid is placed.

    One name is produced per placement row, in the order the query yields them.
    """
    placements = (
        ImageStoragePlacement.select()
        .join(ImageStorage)
        .where(ImageStorage.uuid == uuid)
    )

    location_names = []
    for placement in placements:
        location_row = get_image_location_for_id(placement.location_id)
        location_names.append(location_row.name)
    return location_names
    def get_repo_blob_by_digest(self,
                                repository_ref,
                                blob_digest,
                                include_placements=False):
        """
        Returns the blob in the repository with the given digest.

        If the blob is a placeholder, downloads it from the upstream registry.
        Placeholder blobs are blobs that don't yet have a ImageStoragePlacement
        associated with it.

        Note that there may be multiple records in the same repository for the same blob digest, so
        the return value of this function may change.

        Returns:
            None when the digest is neither a shared storage nor linked to the repository
            through a ManifestBlob row; otherwise the result of the parent class's
            get_repo_blob_by_digest.

        Raises:
            UpstreamRegistryError: when downloading a placeholder blob fails with a digest
                mismatch, size limit, range mismatch or upload error. The underlying
                exception is attached as the cause.
        """
        blob = self._get_shared_storage(blob_digest)
        if blob is None:
            try:
                blob = (ImageStorage.select().join(ManifestBlob).where(
                    ManifestBlob.repository_id == repository_ref.id,
                    ImageStorage.content_checksum == blob_digest,
                ).get())
            except ImageStorage.DoesNotExist:
                return None

        try:
            ImageStoragePlacement.select().where(
                ImageStoragePlacement.storage == blob).get()
        except ImageStoragePlacement.DoesNotExist:
            # No placement row means the blob is still a placeholder: fetch it from upstream.
            # Each failure is translated to UpstreamRegistryError and chained explicitly so
            # the original error stays visible as the direct cause in tracebacks.
            try:
                self._download_blob(repository_ref, blob_digest)
            except BlobDigestMismatchException as e:
                raise UpstreamRegistryError("blob digest mismatch") from e
            except BlobTooLargeException as e:
                raise UpstreamRegistryError(
                    f"blob too large, max allowed is {e.max_allowed}") from e
            except BlobRangeMismatchException as e:
                raise UpstreamRegistryError("range mismatch") from e
            except BlobUploadException as e:
                raise UpstreamRegistryError("invalid blob upload") from e

        return super().get_repo_blob_by_digest(repository_ref, blob_digest,
                                               include_placements)
Beispiel #10
0
    def done(self):
        """
        Marks the manifest builder as complete and disposes of any state.

        Every temporary storage recorded in the builder state that is still flagged as
        uploading, and whose checksum is not the empty layer blob digest, is deleted together
        with its placements. Storage ids that no longer resolve to a row are skipped. Finally
        the builder's session entry is removed.

        This call is optional and it is expected manifest builders will eventually time out if
        unused for an extended period of time.
        """
        for temp_storage_id in self._builder_state.temp_storages:
            try:
                temp_storage = ImageStorage.get(id=temp_storage_id)
                if not temp_storage.uploading:
                    continue
                if temp_storage.content_checksum == EMPTY_LAYER_BLOB_DIGEST:
                    continue

                # Delete all the placements pointing to the storage, then the storage itself.
                placement_cleanup = ImageStoragePlacement.delete().where(
                    ImageStoragePlacement.storage == temp_storage)
                placement_cleanup.execute()
                temp_storage.delete_instance()
            except ImageStorage.DoesNotExist:
                continue

        session.pop(_SESSION_KEY, None)
Beispiel #11
0
def get_placements_for_images(images):
    """
    Group the storage placements of the given images by image storage ID.

    Returns:
        A dict mapping image storage ID to the list of ImageStoragePlacement rows (selected
        together with their location and storage) for that storage. The dict is empty when
        `images` is empty or None.
    """
    if not images:
        return {}

    storage_ids = [image.storage_id for image in images]
    placements = (
        ImageStoragePlacement.select(ImageStoragePlacement, ImageStorageLocation, ImageStorage)
        .join(ImageStorageLocation)
        .switch(ImageStoragePlacement)
        .join(ImageStorage)
        .where(ImageStorage.id << storage_ids)
    )

    grouped = {}
    for placement in placements:
        grouped.setdefault(placement.storage.id, []).append(placement)
    return grouped
Beispiel #12
0
def get_image_and_placements(namespace_name, repo_name, docker_image_id):
    """
    Look up a repository image together with the placements of its storage.

    Returns:
        A (repo_image, placements) tuple, where repo_image is what get_repo_image_and_storage
        returned and placements is a list of ImageStoragePlacement rows (selected with their
        locations) for the image's storage; or (None, None) if no image is found.
    """
    repo_image = get_repo_image_and_storage(namespace_name, repo_name, docker_image_id)
    if repo_image is None:
        return None, None

    placements = (
        ImageStoragePlacement.select(ImageStoragePlacement, ImageStorageLocation)
        .join(ImageStorageLocation)
        .switch(ImageStoragePlacement)
        .join(ImageStorage)
        .where(ImageStorage.id == repo_image.storage_id)
    )
    return repo_image, list(placements)
Beispiel #13
0
def _get_storage(query_modifier):
    """
    Fetch a single storage, with its location names attached, via a caller-refined query.

    `query_modifier` receives the base placement/storage join query and returns the query to
    run. The storage of the first resulting placement is returned, with its `locations`
    attribute set to the set of location names across all resulting placements.

    Raises:
        InvalidImageException: if the refined query yields no placements.
    """
    base_query = (
        ImageStoragePlacement.select(ImageStoragePlacement, ImageStorage)
        .switch(ImageStoragePlacement)
        .join(ImageStorage)
    )
    matches = list(query_modifier(base_query))
    if not matches:
        raise InvalidImageException()

    first_storage = matches[0].storage

    location_names = set()
    for placement in matches:
        location_row = get_image_location_for_id(placement.location_id)
        location_names.add(location_row.name)

    first_storage.locations = location_names
    return first_storage
Beispiel #14
0
def verify_placements():
    """
    Check finished-upload storages against the storage engine and drop stale placements.

    Storages with uploading == False are visited via yield_random_entries. For each storage
    not seen before, every location recorded for it is probed with storage.exists(); when the
    layer path is absent at a location, the matching ImageStoragePlacement row is deleted.
    Storages that get_storage_by_uuid cannot resolve are logged and skipped.
    """
    seen_ids = set()

    candidates = yield_random_entries(
        # `== False` builds a query expression here; it must not be rewritten as `is False`.
        lambda: ImageStorage.select().where(ImageStorage.uploading == False),
        ImageStorage.id,
        1000,
        ImageStorage.select(fn.Max(ImageStorage.id)).scalar(),
        1,
    )

    for storage_row, _abt, _ in candidates:
        if storage_row.id in seen_ids:
            continue
        seen_ids.add(storage_row.id)

        logger.info("Checking placements for storage `%s`", storage_row.uuid)
        try:
            located = model.storage.get_storage_by_uuid(storage_row.uuid)
        except model.InvalidImageException:
            logger.exception("Could not find storage `%s`", storage_row.uuid)
            continue

        layer_path = model.storage.get_layer_path(storage_row)
        recorded_locations = set(located.locations)
        if not recorded_locations:
            continue

        logger.info("Checking locations `%s` for storage `%s`",
                    recorded_locations, storage_row.uuid)
        for location in recorded_locations:
            logger.info("Checking location `%s` for storage `%s`",
                        location, storage_row.uuid)
            if storage.exists([location], layer_path):
                continue

            location_row = _get_location_row(location)
            logger.info(
                "Location `%s` is missing for storage `%s`; removing",
                location,
                storage_row.uuid,
            )
            stale_placement = ImageStoragePlacement.delete().where(
                ImageStoragePlacement.storage == storage_row,
                ImageStoragePlacement.location == location_row,
            )
            stale_placement.execute()
Beispiel #15
0
def store_blob_record_and_temp_link_in_repo(
    repository_id,
    blob_digest,
    location_obj,
    byte_count,
    link_expiration_s,
    uncompressed_byte_count=None,
):
    """
    Store a record of the blob and temporarily link it to the specified repository.

    Inside one database transaction: the ImageStorage row for `blob_digest` is fetched or
    created, size fields that are still None on an existing row are filled in from the
    arguments, a placement at `location_obj` is created if absent, and a temporary link to
    `repository_id` is added via _temp_link_blob.

    Returns:
        The ImageStorage row for the blob.
    """
    assert blob_digest
    assert byte_count is not None

    with db_transaction():
        try:
            blob_storage = ImageStorage.get(content_checksum=blob_digest)
        except ImageStorage.DoesNotExist:
            # Brand new blob: create the row and its first placement together.
            blob_storage = ImageStorage.create(
                content_checksum=blob_digest,
                uploading=False,
                image_size=byte_count,
                uncompressed_size=uncompressed_byte_count,
            )
            ImageStoragePlacement.create(storage=blob_storage, location=location_obj)
        else:
            # Existing blob: only fill in sizes that were never recorded.
            fill_image_size = blob_storage.image_size is None
            fill_uncompressed_size = (
                blob_storage.uncompressed_size is None
                and uncompressed_byte_count is not None
            )

            if fill_image_size:
                blob_storage.image_size = byte_count
            if fill_uncompressed_size:
                blob_storage.uncompressed_size = uncompressed_byte_count
            if fill_image_size or fill_uncompressed_size:
                blob_storage.save()

            try:
                ImageStoragePlacement.get(storage=blob_storage, location=location_obj)
            except ImageStoragePlacement.DoesNotExist:
                ImageStoragePlacement.create(storage=blob_storage, location=location_obj)

        _temp_link_blob(repository_id, blob_storage, link_expiration_s)
        return blob_storage
Beispiel #16
0
def garbage_collect_storage(storage_id_whitelist):
    """
    Performs GC on a possible subset of the storages with the IDs found in the whitelist.

    Each whitelisted storage is checked in its own transaction; if it is orphaned, its
    placements, torrent info, signatures and the storage row itself are deleted. The backing
    files are removed from the storage engine afterwards, outside any transaction, skipping
    blobs whose checksum is in SPECIAL_BLOB_DIGESTS or still present on an ImageStorage row.

    Returns:
        The set of orphaned storage IDs that were removed, or an empty list when the
        whitelist is empty.
    """
    if len(storage_id_whitelist) == 0:
        return []

    def placements_to_filtered_paths_set(placements_list):
        """
        Returns the set of (location name, layer path, content checksum) tuples to remove from
        storage for the given placements, leaving out any CAS paths whose checksum is still
        referenced by an ImageStorage or ApprBlob row in the database.
        """
        if not placements_list:
            return set()

        with ensure_under_transaction():
            # Checksums of the content-addressed placements; these may be shared with other
            # storages and therefore need a reference check before removal.
            cas_checksums = {
                placement.storage.content_checksum
                for placement in placements_list
                if placement.storage.cas_path
            }

            removable_checksums = set()
            if cas_checksums:
                # Check the current image storage.
                storage_rows = ImageStorage.select(ImageStorage.content_checksum).where(
                    ImageStorage.content_checksum << list(cas_checksums)
                )
                still_in_storage = {row.content_checksum for row in storage_rows}
                if still_in_storage:
                    logger.warning(
                        "GC attempted to remove CAS checksums %s, which are still IS referenced",
                        still_in_storage,
                    )

                # Check the ApprBlob table as well.
                appr_rows = ApprBlob.select(ApprBlob.digest).where(
                    ApprBlob.digest << list(cas_checksums)
                )
                still_in_appr = {blob.digest for blob in appr_rows}
                if still_in_appr:
                    logger.warning(
                        "GC attempted to remove CAS checksums %s, which are ApprBlob referenced",
                        still_in_appr,
                    )

                removable_checksums = cas_checksums - still_in_appr - still_in_storage

            # Keep every non-CAS placement, plus CAS placements whose checksum is unreferenced.
            removable = set()
            for placement in placements_list:
                placed_storage = placement.storage
                if (
                    placed_storage.cas_path
                    and placed_storage.content_checksum not in removable_checksums
                ):
                    continue

                removable.add(
                    (
                        get_image_location_for_id(placement.location_id).name,
                        get_layer_path(placed_storage),
                        placed_storage.content_checksum,
                    )
                )
            return removable

    # Note: Both of these deletes must occur in the same transaction (unfortunately) because a
    # storage without any placement is invalid, and a placement cannot exist without a storage.
    # TODO: We might want to allow for null storages on placements, which would allow us to
    # delete the storages, then delete the placements in a non-transaction.
    logger.debug("Garbage collecting storages from candidates: %s", storage_id_whitelist)
    removal_candidates = []
    orphaned_storage_ids = set()
    for candidate_id in storage_id_whitelist:
        logger.debug("Garbage collecting storage %s", candidate_id)

        with db_transaction():
            if not _is_storage_orphaned(candidate_id):
                continue

            orphaned_storage_ids.add(candidate_id)

            orphan_placements = list(
                ImageStoragePlacement.select(ImageStoragePlacement, ImageStorage)
                .join(ImageStorage)
                .where(ImageStorage.id == candidate_id)
            )

            # Rows are deleted in order: placements, torrent info, signatures, then the
            # storage row itself.
            if orphan_placements:
                ImageStoragePlacement.delete().where(
                    ImageStoragePlacement.storage == candidate_id
                ).execute()

            TorrentInfo.delete().where(TorrentInfo.storage == candidate_id).execute()

            ImageStorageSignature.delete().where(
                ImageStorageSignature.storage == candidate_id
            ).execute()

            ImageStorage.delete().where(ImageStorage.id == candidate_id).execute()

            # Work out which paths may be removed. Matching on storage alone is not enough
            # because CAS storages can share a path, so the helper filters out checksums that
            # other rows in the database still use.
            removal_candidates.extend(placements_to_filtered_paths_set(orphan_placements))

    # The blobs themselves are deliberately deleted outside of any transaction. This may leave
    # garbage behind in the storage engine, trading that off for higher database availability.
    for location_name, image_path, storage_checksum in set(removal_candidates):
        if storage_checksum:
            # Skip any specialized blob digests that we know we should keep around.
            if storage_checksum in SPECIAL_BLOB_DIGESTS:
                continue

            # Perform one final check to ensure the blob is not needed.
            still_needed = (
                ImageStorage.select()
                .where(ImageStorage.content_checksum == storage_checksum)
                .exists()
            )
            if still_needed:
                continue

        logger.debug("Removing %s from %s", image_path, location_name)
        config.store.remove({location_name}, image_path)

    return orphaned_storage_ids
Beispiel #17
0
def garbage_collect_storage(storage_id_whitelist):
    """
    Performs GC on a possible subset of the storages with the IDs found in the whitelist.

    Within a single transaction, the orphaned storages among the whitelist are determined via
    _orphaned_storage_query and their placements, torrent info, signatures and storage rows
    are deleted. The backing files are then removed from the storage engine outside the
    transaction, except CAS paths whose checksum is still referenced.

    Returns:
        The orphaned storage IDs as produced by _orphaned_storage_query, or an empty list
        when the whitelist is empty or nothing is orphaned.
    """
    if len(storage_id_whitelist) == 0:
        return []

    def placements_to_filtered_paths_set(placements_list):
        """
        Returns the set of (location name, layer path) tuples to remove from storage for the
        given placements, leaving out any CAS paths whose checksum is still referenced by an
        ImageStorage or ApprBlob row in the database.
        """
        with ensure_under_transaction():
            if not placements_list:
                return set()

            # Checksums of the content-addressed placements; these may be shared with other
            # storages and therefore need a reference check before removal.
            cas_checksums = {
                placement.storage.content_checksum
                for placement in placements_list
                if placement.storage.cas_path
            }

            removable_checksums = set()
            if cas_checksums:
                # Check the current image storage.
                storage_rows = ImageStorage.select(ImageStorage.content_checksum).where(
                    ImageStorage.content_checksum << list(cas_checksums))
                still_in_storage = {row.content_checksum for row in storage_rows}
                if still_in_storage:
                    logger.warning(
                        'GC attempted to remove CAS checksums %s, which are still IS referenced',
                        still_in_storage)

                # Check the ApprBlob table as well.
                appr_rows = ApprBlob.select(ApprBlob.digest).where(
                    ApprBlob.digest << list(cas_checksums))
                still_in_appr = {blob.digest for blob in appr_rows}
                if still_in_appr:
                    logger.warning(
                        'GC attempted to remove CAS checksums %s, which are ApprBlob referenced',
                        still_in_appr)

                removable_checksums = cas_checksums - still_in_appr - still_in_storage

            # Keep every non-CAS placement, plus CAS placements whose checksum is unreferenced.
            removable = set()
            for placement in placements_list:
                placed_storage = placement.storage
                if (placed_storage.cas_path
                        and placed_storage.content_checksum not in removable_checksums):
                    continue

                removable.add((
                    get_image_location_for_id(placement.location_id).name,
                    get_layer_path(placed_storage),
                ))
            return removable

    # Note: Both of these deletes must occur in the same transaction (unfortunately) because a
    # storage without any placement is invalid, and a placement cannot exist without a storage.
    # TODO: We might want to allow for null storages on placements, which would allow us to
    # delete the storages, then delete the placements in a non-transaction.
    logger.debug('Garbage collecting storages from candidates: %s', storage_id_whitelist)
    with db_transaction():
        orphaned_storage_ids = _orphaned_storage_query(storage_id_whitelist)
        if len(orphaned_storage_ids) == 0:
            # Nothing to GC.
            return []

        orphan_placements = list(
            ImageStoragePlacement.select(ImageStoragePlacement, ImageStorage)
            .join(ImageStorage)
            .where(ImageStorage.id << orphaned_storage_ids)
        )

        # Rows are deleted in order: placements, torrent info, signatures, then storages.
        if orphan_placements:
            placement_ids = [placement.id for placement in orphan_placements]
            removed_count = ImageStoragePlacement.delete().where(
                ImageStoragePlacement.id << placement_ids).execute()
            logger.debug('Removed %s image storage placements', removed_count)

        removed_count = TorrentInfo.delete().where(
            TorrentInfo.storage << orphaned_storage_ids).execute()
        logger.debug('Removed %s torrent info records', removed_count)

        removed_count = ImageStorageSignature.delete().where(
            ImageStorageSignature.storage << orphaned_storage_ids).execute()
        logger.debug('Removed %s image storage signatures', removed_count)

        removed_count = ImageStorage.delete().where(
            ImageStorage.id << orphaned_storage_ids).execute()
        logger.debug('Removed %s image storage records', removed_count)

        # Work out which paths may be removed. Matching on storage alone is not enough because
        # CAS storages can share a path, so the helper filters out checksums that other rows
        # in the database still use.
        paths_to_remove = placements_to_filtered_paths_set(orphan_placements)

    # The blobs themselves are deliberately deleted outside of the transaction. This may leave
    # garbage behind in the storage engine, trading that off for higher database availability.
    for location_name, image_path in paths_to_remove:
        logger.debug('Removing %s from %s', image_path, location_name)
        config.store.remove({location_name}, image_path)

    return orphaned_storage_ids
Beispiel #18
0
def create_v1_storage(location_name):
    """
    Create a new non-CAS storage, flagged as uploading, placed at the named location.

    Returns:
        The new ImageStorage row, with a `locations` attribute set to {location_name}.
    """
    new_storage = ImageStorage.create(cas_path=False, uploading=True)
    location_row = get_image_location_for_name(location_name)
    ImageStoragePlacement.create(storage=new_storage, location=location_row.id)
    new_storage.locations = {location_name}
    return new_storage
Beispiel #19
0
    def test_create_placeholder_blobs_on_first_pull(self, test_name,
                                                    proxy_manifest_response):
        """
        Pulling a manifest through the proxy must create placement-less placeholder blobs.

        After a successful manifest GET against the mocked upstream, the stored manifest must
        have one ManifestBlob per layer (plus one for the config blob on schema 2 manifests),
        each layer blob must carry the size declared in the manifest, and none of them may
        have an ImageStoragePlacement yet. Manifest list test cases are skipped.
        """
        case = storage_test_cases[test_name]
        # no blob placeholders are created for manifest lists - we don't have
        # the sub-manifests at manifest list creation time, so there's no way
        # to know which blobs the sub-manifest has.
        manifest_list_types = [
            DOCKER_SCHEMA2_MANIFESTLIST_CONTENT_TYPE,
            OCI_IMAGE_INDEX_CONTENT_TYPE,
        ]
        if case["manifest_type"] in manifest_list_types:
            pytest.skip(
                "manifest list detected - skipping blob placeholder test")

        repo = f"{self.orgname}/{case['image_name']}"
        request_params = {
            "repository": repo,
            "manifest_ref": case["manifest_ref"],
        }

        upstream = proxy_manifest_response(case["manifest_ref"],
                                           case["manifest_json"],
                                           case["manifest_type"])
        with patch("data.registry_model.registry_proxy_model.Proxy",
                   MagicMock(return_value=upstream)):
            headers = _get_auth_headers(self.sub, self.ctx, repo)
            accepted_types = DOCKER_SCHEMA2_CONTENT_TYPES.union(
                OCI_CONTENT_TYPES).union(DOCKER_SCHEMA1_CONTENT_TYPES)
            headers["Accept"] = ", ".join(accepted_types)
            conduct_call(
                self.client,
                case["view_name"],
                url_for,
                "GET",
                request_params,
                expected_code=200,
                headers=headers,
            )

        parsed = parse_manifest_from_bytes(
            Bytes.for_string_or_unicode(case["manifest_json"]),
            case["manifest_type"],
            sparse_manifest_support=True,
        )
        manifest = Manifest.filter(Manifest.digest == parsed.digest).get()
        manifest_dict = parsed.manifest_dict
        layers = manifest_dict.get("layers", manifest_dict.get("fsLayers"))
        manifest_blobs = ManifestBlob.filter(ManifestBlob.manifest == manifest)

        # schema 2 manifests have an extra config blob which we need to take into
        # consideration in the total count
        expected_count = len(layers)
        config_digest = ""
        if parsed.schema_version == 2:
            config_digest = parsed.config.digest
            expected_count += 1

        assert manifest_blobs.count() == expected_count

        for manifest_blob in manifest_blobs:
            checksum = manifest_blob.blob.content_checksum

            # don't assert if digest belongs to a config blob
            if checksum == config_digest:
                continue

            # find the manifest layer that declares this blob's digest
            matching_layer = next(
                (candidate for candidate in layers
                 if candidate.get("digest", candidate.get("blobSum")) == checksum),
                None,
            )
            assert matching_layer is not None

            blob = manifest_blob.blob
            assert blob.image_size == matching_layer.get("size", None)

            # the absence of an image storage placement for a blob indicates that it's
            # a placeholder blob, not yet downloaded from the upstream registry.
            blob_placements = ImageStoragePlacement.filter(
                ImageStoragePlacement.storage == blob)
            assert blob_placements.count() == 0