Example #1
    async def run(self):
        """Signs collections if they have not been signed with key."""
        tasks = []
        # Filter out any content that already has a signature with pubkey_fingerprint
        current_signatures = CollectionVersionSignature.objects.filter(
            pubkey_fingerprint=self.signing_service.pubkey_fingerprint)
        new_content = self.content.exclude(signatures__in=current_signatures)
        ntotal = await sync_to_async(new_content.count)()
        nmsg = _("Signing new CollectionVersions")
        async with ProgressReport(message=nmsg,
                                  code="sign.new.signature",
                                  total=ntotal) as p:
            self.progress_report = p
            async for collection_version in sync_to_async_iterable(
                    new_content.iterator()):
                tasks.append(
                    asyncio.create_task(
                        self.sign_collection_version(collection_version)))
            await asyncio.gather(*tasks)

        # Add any signatures already present in Pulp if part of content list
        present_content = current_signatures.filter(
            signed_collection__in=self.content).exclude(
                pk__in=self.repos_current_signatures)
        ptotal = await sync_to_async(present_content.count)()
        pmsg = _("Adding present CollectionVersionSignatures")
        async with ProgressReport(message=pmsg,
                                  code="sign.present.signature",
                                  total=ptotal) as np:
            async for signature in sync_to_async_iterable(
                    present_content.iterator()):
                await np.aincrement()
                await self.put(DeclarativeContent(content=signature))
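All of these examples follow the same recipe for touching the Django ORM from a coroutine: wrap blocking calls such as count() in sync_to_async, and walk querysets with sync_to_async_iterable. Below is a minimal, hedged sketch of that recipe with a placeholder process() coroutine; the import path for the iterable helper is an assumption and should be adjusted to your plugin's conventions.

import asyncio

from asgiref.sync import sync_to_async
from pulpcore.plugin.sync import sync_to_async_iterable  # assumed import path


async def process_queryset(queryset, process):
    """Fan out one asyncio task per row of ``queryset`` and wait for all of them."""
    total = await sync_to_async(queryset.count)()  # blocking ORM call runs in a thread
    tasks = []
    async for obj in sync_to_async_iterable(queryset.iterator()):
        tasks.append(asyncio.create_task(process(obj)))  # schedule work per object
    await asyncio.gather(*tasks)
    return total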
Example #2
async def _repair_ca(content_artifact, repaired=None):
    remote_artifacts_qs = content_artifact.remoteartifact_set.all().select_related("remote")

    # Check the queryset itself for emptiness; the wrapper returned by
    # sync_to_async_iterable() is always truthy and cannot be tested this way.
    if not await sync_to_async(remote_artifacts_qs.exists)():
        log.warning(
            _("Artifact {} is unrepairable - no remote source").format(content_artifact.artifact)
        )
        return False

    async for remote_artifact in sync_to_async_iterable(remote_artifacts_qs):
        detail_remote = await sync_to_async(remote_artifact.remote.cast)()
        downloader = detail_remote.get_downloader(remote_artifact)
        dl_result = await downloader.run()
        if dl_result.artifact_attributes["sha256"] == content_artifact.artifact.sha256:
            with open(dl_result.path, "rb") as src:
                filename = content_artifact.artifact.file.name
                await sync_to_async(content_artifact.artifact.file.delete)(save=False)
                await sync_to_async(content_artifact.artifact.file.save)(filename, src, save=False)
            if repaired is not None:
                await repaired.aincrement()
            return True
        log.warn(_("Redownload failed from {}.").format(remote_artifact.url))

    return False
Example #3
    async def run(self):
        """
        The coroutine for this stage.

        Returns:
            The coroutine for this stage.
        """
        async for batch in self.batches():
            artifact_digests_by_type = defaultdict(list)

            # For each unsaved artifact, check its digests in the order of COMMON_DIGEST_FIELDS
            # and the first digest which is found is added to the list of digests of that type.
            # We assume that in general only one digest is provided and that it will be
            # sufficient to identify the Artifact.
            for d_content in batch:
                for d_artifact in d_content.d_artifacts:
                    if d_artifact.artifact._state.adding:
                        if not d_artifact.deferred_download:
                            _check_for_forbidden_checksum_type(
                                d_artifact.artifact)
                        for digest_type in Artifact.COMMON_DIGEST_FIELDS:
                            digest_value = getattr(d_artifact.artifact,
                                                   digest_type)
                            if digest_value:
                                artifact_digests_by_type[digest_type].append(
                                    digest_value)
                                break

            # For each type of digest, fetch all the existing Artifacts where digest "in"
            # the list we built earlier. Walk over all the artifacts again and compare the
            # digest of the new artifact to those of the existing ones - if one matches,
            # swap it out with the existing one.
            for digest_type, digests in artifact_digests_by_type.items():
                query_params = {"{attr}__in".format(attr=digest_type): digests}
                existing_artifacts_qs = Artifact.objects.filter(**query_params)
                existing_artifacts = sync_to_async_iterable(
                    existing_artifacts_qs)
                await sync_to_async(existing_artifacts_qs.touch)()
                for d_content in batch:
                    for d_artifact in d_content.d_artifacts:
                        artifact_digest = getattr(d_artifact.artifact,
                                                  digest_type)
                        if artifact_digest:
                            async for result in existing_artifacts:
                                result_digest = getattr(result, digest_type)
                                if result_digest == artifact_digest:
                                    d_artifact.artifact = result
                                    break
            for d_content in batch:
                await self.put(d_content)
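The comments above describe the dedup strategy: collect one digest per unsaved Artifact, query existing rows with a dynamic "<digest>__in" filter, and swap the unsaved instance for a saved match. A compressed, synchronous sketch of that idea follows; the deduplicate() helper and its arguments are hypothetical placeholders, not part of the stage API.

def deduplicate(model, digest_field, unsaved_by_digest):
    """Swap unsaved instances for existing rows sharing the same digest value.

    ``unsaved_by_digest`` maps digest value -> unsaved model instance.
    """
    query_params = {"{attr}__in".format(attr=digest_field): list(unsaved_by_digest)}
    for existing in model.objects.filter(**query_params):
        digest = getattr(existing, digest_field)
        if digest in unsaved_by_digest:
            unsaved_by_digest[digest] = existing  # reuse the saved row
    return unsaved_by_digest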
Example #4
async def _repair_artifacts_for_content(subset=None, verify_checksums=True):
    loop = asyncio.get_running_loop()
    pending = set()

    query_set = models.ContentArtifact.objects.exclude(artifact__isnull=True)

    if subset is not None and await sync_to_async(subset.exists)():
        query_set = query_set.filter(content__in=subset)

    async with ProgressReport(
        message="Identify missing units", code="repair.missing"
    ) as missing, ProgressReport(
        message="Identify corrupted units", code="repair.corrupted"
    ) as corrupted, ProgressReport(
        message="Repair corrupted units", code="repair.repaired"
    ) as repaired:

        with ThreadPoolExecutor(max_workers=2) as checksum_executor:
            async for content_artifact in sync_to_async_iterable(
                query_set.select_related("artifact").iterator()
            ):
                artifact = content_artifact.artifact

                valid = await loop.run_in_executor(None, default_storage.exists, artifact.file.name)
                if not valid:
                    await missing.aincrement()
                    log.warn(_("Missing file for {}").format(artifact))
                elif verify_checksums:
                    # default ThreadPoolExecutor uses num cores x 5 threads. Since we're doing
                    # such long and sequential reads, using too many threads might hurt more
                    # than help (on HDDs, maybe not on SSDs) by making the disk access pattern
                    # more random. Put it in a separate executor with limited threads.
                    # Should stay in (an) executor so that at least it doesn't completely block
                    # downloads.
                    valid = await loop.run_in_executor(
                        checksum_executor, _verify_artifact, artifact
                    )
                    if not valid:
                        await corrupted.aincrement()
                        log.warn(_("Digest mismatch for {}").format(artifact))

                if not valid:
                    if len(pending) >= 5:  # Limit the number of concurrent repair tasks
                        done, pending = await asyncio.wait(
                            pending, return_when=asyncio.FIRST_COMPLETED
                        )
                        await asyncio.gather(*done)  # Clean up tasks
                    pending.add(asyncio.ensure_future(_repair_ca(content_artifact, repaired)))
        await asyncio.gather(*pending)
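The repair loop above keeps at most five repair tasks in flight: once the pending set is full it waits for the first completion, re-awaits the finished tasks so their results and exceptions surface, and only then schedules more. Here is that throttling pattern in isolation as a runnable, hedged sketch; the limit of 5 mirrors the example and the sleeping worker is purely illustrative.

import asyncio


async def bounded_gather(coroutines, limit=5):
    """Run ``coroutines`` with at most ``limit`` of them in flight at once."""
    pending = set()
    for coro in coroutines:
        if len(pending) >= limit:
            # Block until at least one task finishes, then re-await the done ones
            # so their exceptions (if any) are surfaced.
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            await asyncio.gather(*done)
        pending.add(asyncio.ensure_future(coro))
    await asyncio.gather(*pending)


async def _demo():
    async def work(i):
        await asyncio.sleep(0.01)
        return i

    await bounded_gather((work(i) for i in range(20)))


if __name__ == "__main__":
    asyncio.run(_demo())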
Example #5
    async def run(self):
        """
        The coroutine for this stage.

        Returns:
            The coroutine for this stage.
        """
        async with ProgressReport(message="Associating Content",
                                  code="associating.content") as pb:
            to_delete = {
                i
                async for i in sync_to_async_iterable(
                    self.new_version.content.values_list("pk", flat=True))
            }

            async for batch in self.batches():
                to_add = set()
                for d_content in batch:
                    try:
                        to_delete.remove(d_content.content.pk)
                    except KeyError:
                        to_add.add(d_content.content.pk)
                        await self.put(d_content)

                if to_add:
                    await sync_to_async(self.new_version.add_content)(
                        Content.objects.filter(pk__in=to_add)
                    )
                    await pb.aincrease_by(len(to_add))

            if self.allow_delete:
                async with ProgressReport(message="Un-Associating Content",
                                          code="unassociating.content") as pb:
                    if to_delete:
                        await sync_to_async(self.new_version.remove_content)(
                            Content.objects.filter(pk__in=to_delete))
                        await pb.aincrease_by(len(to_delete))
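The association logic works by elimination: to_delete starts as every pk already in the new version, each pk seen in the incoming batches is removed from it (or recorded in to_add when it was not already present), and whatever is left over at the end is un-associated. The same bookkeeping on plain integers:

existing = {1, 2, 3, 4}        # pks already in the repository version
to_delete = set(existing)
to_add = set()
for pk in [2, 3, 5]:           # pks arriving in the batches
    try:
        to_delete.remove(pk)   # still declared; keep it
    except KeyError:
        to_add.add(pk)         # newly declared; associate it
assert to_add == {5}
assert to_delete == {1, 4}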
Example #6
    async def run(self):
        """
        The coroutine for this stage.

        Returns:
            The coroutine for this stage.
        """
        async for batch in self.batches():
            content_q_by_type = defaultdict(lambda: Q(pk__in=[]))
            d_content_by_nat_key = defaultdict(list)
            for d_content in batch:
                if d_content.content._state.adding:
                    model_type = type(d_content.content)
                    unit_q = d_content.content.q()
                    content_q_by_type[model_type] |= unit_q
                    d_content_by_nat_key[d_content.content.natural_key()].append(d_content)

            for model_type, content_q in content_q_by_type.items():
                try:
                    await sync_to_async(
                        model_type.objects.filter(content_q).touch)()
                except AttributeError:
                    raise TypeError(
                        "Plugins which declare custom ORM managers on their content classes "
                        "should have those managers inherit from "
                        "pulpcore.plugin.models.ContentManager.")
                async for result in sync_to_async_iterable(
                        model_type.objects.filter(content_q).iterator()):
                    for d_content in d_content_by_nat_key[
                            result.natural_key()]:
                        d_content.content = result

            for d_content in batch:
                await self.put(d_content)
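Example #6 builds a single ORM query per content model by OR-ing each unit's natural-key Q object onto an accumulator that starts as the always-empty Q(pk__in=[]). A small, hedged sketch of that accumulation; objects exposing .content with a q() natural-key query, as DeclarativeContent does above, are assumed.

from collections import defaultdict

from django.db.models import Q


def batch_queries_by_type(d_contents):
    """Collect one combined Q per content model for everything in the batch."""
    content_q_by_type = defaultdict(lambda: Q(pk__in=[]))  # identity element for OR
    for d_content in d_contents:
        if d_content.content._state.adding:  # only units not yet saved
            content_q_by_type[type(d_content.content)] |= d_content.content.q()
    return content_q_by_type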
Example #7
    async def migrate_to_pulp3(self, content_model, content_type):
        """
        A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

        Plugin writers might want to override this method if it doesn't satisfy their needs as is.

        In this implementation there is an assumption that each content has one artifact.

        Args:
            content_model: Pulp 2to3 detail content model to be migrated to Pulp 3
            content_type: type of pulp2 content that is being migrated
        """
        @functools.lru_cache(maxsize=20)
        def get_remote_by_importer_id(importer_id):
            """
            Args:
                importer_id(str): Id of an importer in Pulp 2

            Returns:
                remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3

            """
            try:
                pulp2importer = Pulp2Importer.objects.get(
                    pulp2_object_id=importer_id)
            except ObjectDoesNotExist:
                return
            return pulp2importer.pulp3_remote

        futures = []
        is_lazy_type = content_type in self.migrator.lazy_types
        is_artifactless_type = content_type in self.migrator.artifactless_types
        has_future = content_type in self.migrator.future_types
        is_multi_artifact = content_type in self.migrator.multi_artifact_types

        if is_lazy_type:
            # go through all of the content that hasn't been migrated OR has been migrated
            # but has new lazy catalog entries.
            units_with_new_lces = (Pulp2LazyCatalog.objects.filter(
                is_migrated=False).values("pulp2_unit_id").distinct())
            already_migrated = ~Q(pulp2content__pulp3_content=None)
            no_new_lces = ~Q(pulp2content__pulp2_id__in=units_with_new_lces)
            pulp_2to3_detail_qs = content_model.objects.exclude(
                already_migrated & no_new_lces)
        else:
            # go through all of the content that hasn't been migrated
            pulp_2to3_detail_qs = content_model.objects.filter(
                pulp2content__pulp3_content=None)

        # order by pulp2_repo if it's set
        if content_model.set_pulp2_repo:
            pulp_2to3_detail_qs = pulp_2to3_detail_qs.order_by("repo_id")

        async with ProgressReport(
                message="Migrating {} content to Pulp 3".format(content_type),
                code="migrating.{}.content".format(self.migrator.pulp2_plugin),
                total=await sync_to_async(pulp_2to3_detail_qs.count)(),
        ) as pb:
            select_extra = [
                "pulp2content",
                "pulp2content__pulp3_content",
            ]

            if content_model.set_pulp2_repo:
                select_extra.append("pulp2content__pulp2_repo")

            pulp_2to3_detail_qs = pulp_2to3_detail_qs.select_related(
                *select_extra)
            async for pulp_2to3_detail_content in sync_to_async_iterable(
                    pulp_2to3_detail_qs.iterator(chunk_size=800)):
                dc = None
                pulp2content = await sync_to_async(
                    Pulp2Content.objects.get
                )(pk=pulp_2to3_detail_content.pulp2content.pk)

                # only content that supports on_demand download can have entries in LCE
                if is_lazy_type:
                    # get all Lazy Catalog Entries (LCEs) for this content
                    pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
                        pulp2_unit_id=pulp2content.pulp2_id,
                        is_migrated=False,
                    )
                    # force the queryset to evaluate
                    await sync_to_async(bool)(pulp2lazycatalog)

                    if not pulp2content.downloaded and not pulp2lazycatalog:
                        # A distribution tree can be from an on_demand repo but without any images,
                        # e.g. CentOS 8 High Availability. Do not skip in that case.
                        if not is_multi_artifact:
                            _logger.warning(
                                _("On_demand content cannot be migrated without an entry in the "
                                  "lazy catalog, pulp2 unit_id: {}".format(
                                      pulp2content.pulp2_id)))
                            continue

                if (pulp2content.pulp3_content is not None and is_lazy_type
                        and pulp2lazycatalog):
                    # find already created pulp3 content
                    pulp3content = pulp2content.pulp3_content
                    extra_info = None
                    if is_multi_artifact:
                        extra_info = pulp_2to3_detail_content.get_treeinfo_serialized()
                        # If we can't find the .treeinfo for the Distribution, warn and skip
                        if extra_info is None:
                            _logger.warning(
                                _("Failed to find or instantiate extra_info for multi-artifact "
                                  "pulp2 unit_id: {} ; skipping".format(
                                      pulp2content.pulp2_id)))
                            continue
                else:
                    # create pulp3 content and assign relations if present
                    pulp3content, extra_info = await sync_to_async(
                        pulp_2to3_detail_content.create_pulp3_content)()

                # If we can't find/create the Distribution, warn and skip
                if pulp3content is None:
                    _logger.warning(
                        _("Failed to find or instantiate pulp3 content for pulp2 unit_id: {} ;"
                          " skipping".format(pulp2content.pulp2_id)))
                    continue

                future_relations = {"pulp2content": pulp2content}
                if extra_info:
                    future_relations.update(extra_info)

                if is_multi_artifact:
                    d_artifacts = []
                    base_path = pulp2content.pulp2_storage_path
                    remotes = set()
                    missing_artifact = False
                    remote_declarative_artifacts = []

                    for image_relative_path in extra_info["download"][
                            "images"]:
                        remote_url_tuples = []
                        image_path = os.path.join(base_path,
                                                  image_relative_path)
                        downloaded = os.path.exists(image_path)
                        if downloaded:
                            artifact = await self.create_artifact(
                                image_path, None, None, downloaded=downloaded)
                            if artifact is None:
                                continue
                        else:
                            artifact = Artifact()

                        lces = await sync_to_async(list)(
                            pulp2lazycatalog.filter(
                                pulp2_storage_path=image_path))

                        if not lces and not downloaded:
                            continue

                        # collect all urls and respective migrated remotes for the image
                        for lce in lces:
                            remote = await sync_to_async(
                                get_remote_by_importer_id)(
                                    lce.pulp2_importer_id)
                            if remote:
                                remotes.add(remote)
                                remote_url_tuples.append(
                                    (remote, lce.pulp2_url))

                        for remote, url in remote_url_tuples:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=image_relative_path,
                                remote=remote,
                                deferred_download=not downloaded,
                            )
                            remote_declarative_artifacts.append(da)

                        if not remote_url_tuples:
                            # either no LCEs existed but it's a downloaded content (and we can
                            # proceed), or remotes for any of LCEs haven't been migrated (and
                            # nothing can be done at this point)
                            if not downloaded:
                                missing_artifact = True
                                break

                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=NOT_USED,
                                relative_path=image_relative_path,
                                remote=None,
                                deferred_download=False,
                            )
                            d_artifacts.append(da)

                    d_artifacts.extend(remote_declarative_artifacts)

                    # Only skip the rest of the steps if there are any images that are expected
                    # to be downloaded. There are distribution trees without images in the wild,
                    # e.g. CentOS 8 High Availability.
                    if missing_artifact and extra_info["download"]["images"]:
                        _logger.warning(
                            _("On_demand content cannot be migrated without a remote "
                              "pulp2 unit_id: {}".format(
                                  pulp2content.pulp2_id)))
                        continue

                    for lce in pulp2lazycatalog:
                        lce.is_migrated = True
                    future_relations.update({"lces": list(pulp2lazycatalog)})

                    # We do this last because we need the remote url which is only found in the LCE
                    # of the image files. There is no LCE for the .treeinfo file itself.
                    relative_path = (pulp_2to3_detail_content.
                                     relative_path_for_content_artifact)
                    treeinfo_path = os.path.join(
                        pulp2content.pulp2_storage_path, relative_path)
                    artifact = await self.create_artifact(treeinfo_path,
                                                          None,
                                                          None,
                                                          downloaded=True)
                    if artifact is None:
                        continue
                    if remotes:
                        for remote in remotes:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=os.path.join(remote.url, relative_path),
                                relative_path=relative_path,
                                remote=remote,
                                deferred_download=False,
                            )
                            d_artifacts.append(da)
                    else:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=relative_path,
                            remote=None,
                            deferred_download=False,
                        )
                        d_artifacts.append(da)
                    dc = DeclarativeContent(content=pulp3content,
                                            d_artifacts=d_artifacts)
                    dc.extra_data = future_relations
                    await self.put(dc)
                # not all content units have files, create DC without artifact
                elif is_artifactless_type:
                    # dc without artifact
                    dc = DeclarativeContent(content=pulp3content)
                    dc.extra_data = future_relations
                    await self.put(dc)
                else:

                    # create artifact for content that has file
                    artifact = await self.create_artifact(
                        pulp2content.pulp2_storage_path,
                        pulp_2to3_detail_content.expected_digests,
                        pulp_2to3_detail_content.expected_size,
                        downloaded=pulp2content.downloaded,
                    )
                    if artifact is None:
                        if pb:
                            await pb.aincrement()
                        continue

                    relative_path = (pulp_2to3_detail_content.
                                     relative_path_for_content_artifact)
                    remote_lce_tuples = []
                    deferred_download = not pulp2content.downloaded

                    if is_lazy_type and pulp2lazycatalog:
                        for lce in pulp2lazycatalog:
                            remote = await sync_to_async(
                                get_remote_by_importer_id)(
                                    lce.pulp2_importer_id)
                            if remote:
                                remote_lce_tuples.append((remote, lce))

                        # handle DA and RA creation for content that supports on_demand
                        # Downloaded or on_demand content with LCEs.
                        #
                        # To create multiple remote artifacts, create multiple instances of
                        # declarative content which will differ by url/remote in their
                        # declarative artifacts

                    if remote_lce_tuples:
                        for remote, lce in remote_lce_tuples:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=lce.pulp2_url,
                                relative_path=relative_path,
                                remote=remote,
                                deferred_download=deferred_download,
                            )
                            lce.is_migrated = True
                            dc = DeclarativeContent(content=pulp3content,
                                                    d_artifacts=[da])

                            # yes, all LCEs are assigned for each dc to be resolved at a later
                            # stage. Some LCEs might be "bad" and not have a migrated importer
                            # but we still need to resolve them. It creates some duplicated LCEs
                            # to process later but ensures that all are resolved if at least one
                            # valid one is migrated.
                            future_relations.update(
                                {"lces": list(pulp2lazycatalog)})
                            dc.extra_data = future_relations
                            await self.put(dc)

                    else:
                        # No migratable LCE available
                        if deferred_download:
                            _logger.warning(
                                _("On_demand content cannot be migrated without a remote "
                                  "pulp2 unit_id: {}".format(
                                      pulp2content.pulp2_id)))
                            continue

                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=relative_path,
                            remote=None,
                            deferred_download=False,
                        )
                        dc = DeclarativeContent(content=pulp3content,
                                                d_artifacts=[da])
                        dc.extra_data = future_relations
                        await self.put(dc)

                if pb:
                    await pb.aincrement()

                if has_future and dc:
                    futures.append(dc)
                resolve_futures = len(futures) >= DEFAULT_BATCH_SIZE
                if resolve_futures:
                    for dc in futures:
                        await dc.resolution()
                    futures.clear()

            # resolve futures if there are any left
            for dc in futures:
                await dc.resolution()
            futures.clear()
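One reusable detail from Example #7 is get_remote_by_importer_id: the synchronous database lookup is memoized with functools.lru_cache and then invoked through sync_to_async at each call site, so repeated importer ids never hit the database twice. A hedged, stand-alone illustration of that combination; the lookup body is a placeholder for the real Pulp2Importer query.

import functools

from asgiref.sync import sync_to_async


@functools.lru_cache(maxsize=20)
def lookup_remote(importer_id):
    """Stand-in for the Pulp2Importer lookup; only the first call per id is expensive."""
    return f"remote-for-{importer_id}"


async def resolve_remotes(importer_ids):
    # sync_to_async keeps the (potentially blocking) lookup off the event loop,
    # while lru_cache makes repeated ids effectively free.
    return [await sync_to_async(lookup_remote)(i) for i in importer_ids]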
Example #8
    async def _handle_remote_artifacts(self, batch):
        """
        Build a list of only :class:`~pulpcore.plugin.models.RemoteArtifact` that need
        to be created for the batch.

        Args:
            batch (list): List of :class:`~pulpcore.plugin.stages.DeclarativeContent`.

        Returns:
            List: Of :class:`~pulpcore.plugin.models.RemoteArtifact`.
        """
        remotes_present = set()
        for d_content in batch:
            for d_artifact in d_content.d_artifacts:
                if d_artifact.remote:
                    remotes_present.add(d_artifact.remote)

        await sync_to_async(prefetch_related_objects)(
            [d_c.content for d_c in batch],
            Prefetch(
                "contentartifact_set",
                queryset=ContentArtifact.objects.prefetch_related(
                    Prefetch(
                        "remoteartifact_set",
                        queryset=RemoteArtifact.objects.filter(
                            remote__in=remotes_present),
                        to_attr="_remote_artifact_saver_ras",
                    )),
                to_attr="_remote_artifact_saver_cas",
            ),
        )

        # Now return the list of RemoteArtifacts that need to be saved.
        #
        # We can end up with duplicates (diff pks, same sha256) in the sequence below,
        # so we store by-sha256 and then return the final values
        ras_to_create = {}  # { str(<sha256>): RemoteArtifact, ... }
        ras_to_update = {}
        for d_content in batch:
            for d_artifact in d_content.d_artifacts:
                if not d_artifact.remote:
                    continue

                async for content_artifact in sync_to_async_iterable(
                        d_content.content._remote_artifact_saver_cas):
                    if d_artifact.relative_path == content_artifact.relative_path:
                        break
                else:
                    if self.fix_mismatched_remote_artifacts:
                        # We couldn't match a DeclarativeArtifact to a ContentArtifact by rel_path.
                        # If there are any paths available (i.e., other ContentArtifacts for this
                        # Artifact), complain to the logs, pick the rel_path from the last
                        # ContentArtifact we examined, and continue.
                        #
                        # If we can't find anything to choose from (can that even happen?), fail
                        # the process.
                        avail_paths = ",".join([
                            ca.relative_path for ca in
                            d_content.content._remote_artifact_saver_cas
                        ])
                        if avail_paths:
                            msg = _(
                                "No declared artifact with relative path '{rp}' for content '{c}'"
                                " from remote '{rname}'. Using last from available-paths : '{ap}'"
                            )
                            log.warning(
                                msg.format(
                                    rp=d_artifact.relative_path,
                                    c=d_content.content.natural_key(),
                                    rname=d_artifact.remote.name,
                                    ap=avail_paths,
                                ))
                            d_artifact.relative_path = content_artifact.relative_path
                        else:
                            msg = _(
                                "No declared artifact with relative path '{rp}' for content '{c}'"
                                " from remote '{rname}', and no paths available."
                            )
                            raise ValueError(
                                msg.format(
                                    rp=d_artifact.relative_path,
                                    c=d_content.content.natural_key(),
                                    rname=d_artifact.remote.name,
                                ))
                    else:
                        msg = _(
                            'No declared artifact with relative path "{rp}" for content "{c}"'
                        )
                        raise ValueError(
                            msg.format(rp=d_artifact.relative_path,
                                       c=d_content.content))

                async for remote_artifact in sync_to_async_iterable(
                        content_artifact._remote_artifact_saver_ras):
                    if d_artifact.url == remote_artifact.url:
                        break

                    if d_artifact.remote.pk == remote_artifact.remote_id:
                        key = f"{content_artifact.pk}-{remote_artifact.remote_id}"
                        remote_artifact.url = d_artifact.url
                        ras_to_update[key] = remote_artifact
                        break
                else:
                    remote_artifact = self._create_remote_artifact(
                        d_artifact, content_artifact)
                    key = f"{content_artifact.pk}-{d_artifact.remote.pk}"
                    ras_to_create[key] = remote_artifact

        # Make sure we create/update RemoteArtifacts in a stable order, to help
        # prevent deadlocks in high-concurrency environments. We can rely on the
        # Artifact sha256 for our ordering.
        if ras_to_create:
            ras_to_create_ordered = sorted(list(ras_to_create.values()),
                                           key=lambda x: x.sha256)
            await sync_to_async(RemoteArtifact.objects.bulk_create)(ras_to_create_ordered)
        if ras_to_update:
            ras_to_update_ordered = sorted(list(ras_to_update.values()),
                                           key=lambda x: x.sha256)
            await sync_to_async(RemoteArtifact.objects.bulk_update)(
                ras_to_update_ordered, fields=["url"]
            )
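Example #8 relies twice on Python's for/else (here with async for): the else branch runs only when the loop completes without hitting break, which is how "no matching ContentArtifact" and "no matching RemoteArtifact" are detected before falling back to fix-up or creation. The idiom in miniature, outside the ORM:

def match_or_create(existing_paths, wanted_path):
    """Return ("reuse", path) on a match, or ("create", path) when the loop never breaks."""
    for path in existing_paths:
        if path == wanted_path:
            break          # found it; the else clause is skipped
    else:
        return ("create", wanted_path)  # loop exhausted without break
    return ("reuse", path)


assert match_or_create(["a/b", "c/d"], "c/d") == ("reuse", "c/d")
assert match_or_create(["a/b"], "x/y") == ("create", "x/y")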