Example #1
    async def _should_we_sync(self):
        """Check last synced metadata time."""
        msg = _("no-op: Checking if remote changed since last sync.")
        noop = ProgressReport(message=msg, code="noop")
        noop.state = TASK_STATES.COMPLETED
        noop.save()

        if not self.repository.remote:
            return True

        if self.remote != self.repository.remote.cast():
            return True

        root, api_version = await self._get_root_api(self.remote.url)
        if api_version == 3:
            downloader = self.remote.get_downloader(
                url=root, silence_errors_for_response_status_codes={404})
            try:
                metadata = parse_metadata(await downloader.run())
            except FileNotFoundError:
                return True

            try:
                self.last_synced_metadata_time = parse_datetime(
                    metadata["published"])
            except KeyError:
                return True

            sources = set()
            if self.collection_info:
                sources = {r.source for r in self.collection_info if r.source}
            sources.add(self.remote.url)
            if len(sources) > 1:
                return True

            if self.last_synced_metadata_time == self.repository.last_synced_metadata_time:
                noop.message = _(
                    "no-op: {remote} did not change since last sync - {published}"
                    .format(remote=self.remote.url,
                            published=self.last_synced_metadata_time))
                noop.save()
                return False

        return True
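The check above treats the remote's v3 `published` timestamp as a change marker and reports a no-op when it matches the value recorded on the repository. A minimal stand-alone sketch of that optimization, with a hypothetical fetch_remote_metadata() standing in for the Pulp downloader and parse_metadata call:

from datetime import datetime, timezone

def fetch_remote_metadata():
    # Hypothetical stand-in for downloading and parsing the remote metadata endpoint.
    return {"published": "2021-06-01T12:00:00+00:00"}

def should_we_sync(last_synced_metadata_time):
    """Return False only when the remote's published time matches the recorded sync time."""
    metadata = fetch_remote_metadata()
    published = metadata.get("published")
    if published is None:
        return True  # no timestamp advertised: be safe and sync
    return datetime.fromisoformat(published) != last_synced_metadata_time

recorded = datetime(2021, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
print(should_we_sync(recorded))  # False - the remote did not change, so the sync is skipped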
Example #2
    async def run(self):
        """
        ContainerFirstStage.
        """
        future_manifests = []
        tag_list = []
        to_download = []
        man_dcs = {}
        total_blobs = []

        with ProgressReport(
            message='Downloading tag list', code='downloading.tag_list', total=1
        ) as pb:
            repo_name = self.remote.namespaced_upstream_name
            relative_url = '/v2/{name}/tags/list'.format(name=repo_name)
            tag_list_url = urljoin(self.remote.url, relative_url)
            list_downloader = self.remote.get_downloader(url=tag_list_url)
            await list_downloader.run(extra_data={'repo_name': repo_name})

            with open(list_downloader.path) as tags_raw:
                tags_dict = json.loads(tags_raw.read())
                tag_list = tags_dict['tags']

            # check for the presence of the pagination link header
            link = list_downloader.response_headers.get('Link')
            await self.handle_pagination(link, repo_name, tag_list)
            tag_list = self.filter_tags(tag_list)
            pb.increment()

        for tag_name in tag_list:
            relative_url = '/v2/{name}/manifests/{tag}'.format(
                name=self.remote.namespaced_upstream_name,
                tag=tag_name,
            )
            url = urljoin(self.remote.url, relative_url)
            downloader = self.remote.get_downloader(url=url)
            to_download.append(downloader.run(extra_data={'headers': V2_ACCEPT_HEADERS}))

        pb_parsed_tags = ProgressReport(
            message='Processing Tags',
            code='processing.tag',
            state=TASK_STATES.RUNNING,
            total=len(tag_list)
        )

        for download_tag in asyncio.as_completed(to_download):
            tag = await download_tag
            with open(tag.path, 'rb') as content_file:
                raw_data = content_file.read()
            content_data = json.loads(raw_data)
            media_type = content_data.get('mediaType')
            tag.artifact_attributes['file'] = tag.path
            saved_artifact = Artifact(**tag.artifact_attributes)
            try:
                saved_artifact.save()
            except IntegrityError:
                del tag.artifact_attributes['file']
                saved_artifact = Artifact.objects.get(**tag.artifact_attributes)
            tag_dc = self.create_tag(saved_artifact, tag.url)

            if media_type in (MEDIA_TYPE.MANIFEST_LIST, MEDIA_TYPE.INDEX_OCI):
                list_dc = self.create_tagged_manifest_list(
                    tag_dc, content_data)
                await self.put(list_dc)
                tag_dc.extra_data['man_relation'] = list_dc
                for manifest_data in content_data.get('manifests'):
                    man_dc = self.create_manifest(list_dc, manifest_data)
                    future_manifests.append(man_dc)
                    man_dcs[man_dc.content.digest] = man_dc
                    await self.put(man_dc)
            else:
                man_dc = self.create_tagged_manifest(tag_dc, content_data, raw_data)
                await self.put(man_dc)
                tag_dc.extra_data['man_relation'] = man_dc
                self.handle_blobs(man_dc, content_data, total_blobs)
            await self.put(tag_dc)
            pb_parsed_tags.increment()

        pb_parsed_tags.state = 'completed'
        pb_parsed_tags.save()

        for manifest_future in future_manifests:
            man = await manifest_future.resolution()
            with man._artifacts.get().file.open() as content_file:
                raw = content_file.read()
            content_data = json.loads(raw)
            man_dc = man_dcs[man.digest]
            self.handle_blobs(man_dc, content_data, total_blobs)
        for blob in total_blobs:
            await self.put(blob)
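The stage above downloads `/v2/<name>/tags/list` and then follows the pagination `Link` header via handle_pagination(). A small illustrative sketch of that loop using the requests library; the registry URL and repository name are placeholders, and token authentication is omitted:

from urllib.parse import urljoin

import requests

def fetch_all_tags(registry_url, repo_name):
    """Collect tags from a v2 registry, following the Link header for pagination."""
    tags = []
    url = urljoin(registry_url, f"/v2/{repo_name}/tags/list")
    while url:
        response = requests.get(url)
        response.raise_for_status()
        tags.extend(response.json().get("tags", []))
        # requests exposes the parsed Link header as response.links; no 'next' means last page.
        next_link = response.links.get("next")
        url = urljoin(registry_url, next_link["url"]) if next_link else None
    return tags

# Placeholder values; a real registry typically also requires Bearer-token authentication.
# fetch_all_tags("https://registry.example.com", "library/ubuntu")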
Example #3
async def pre_migrate_content(content_model, mutable_type, premigrate_hook):
    """
    A coroutine to pre-migrate Pulp 2 content, including all details for on_demand content.

    Args:
        content_model: Models for content which is being migrated.
        mutable_type: Boolean that indicates whether the content type is mutable.
        premigrate_hook: Optional callable that returns the Pulp 2 content ids to restrict
            pre-migration to.
    """
    batch_size = 1000
    content_type = content_model.pulp2.TYPE_ID
    pulp2content = []
    pulp2mutatedcontent = []

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(
        pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(
        Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug(
        'The latest migrated {type} content has {timestamp} timestamp.'.format(
            type=content_type, timestamp=last_updated))

    if premigrate_hook:
        pulp2_content_ids = premigrate_hook()
        mongo_content_qs = content_model.pulp2.objects(
            _last_updated__gte=last_updated, id__in=pulp2_content_ids)
    else:
        # query only newly created/updated items
        mongo_content_qs = content_model.pulp2.objects(
            _last_updated__gte=last_updated)
    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type, total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(
            content_type.upper()),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(
            content_type.upper()),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    existing_count = 0
    fields = set(['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        fields.add('downloaded')
    for i, record in enumerate(
            mongo_content_qs.only(*fields).batch_size(batch_size)):
        if record._last_updated == last_updated:
            # corner case - content with the latest ``last_updated`` date might already be
            # pre-migrated; check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(
                pulp2_last_updated=last_updated, pulp2_id=record.id)
            if migrated:
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2content_pb.save()
                pulp2detail_pb.total -= 1
                pulp2detail_pb.save()
        else:
            if mutable_type:
                # This is a mutable content type. Query for the existing pulp2content.
                # If one was found, it means that the migrated content is older than the incoming.
                # Delete outdated migrated pulp2content and create a new pulp2content
                try:
                    outdated = Pulp2Content.objects.get(pulp2_id=record.id)
                except Pulp2Content.DoesNotExist:
                    pass
                else:
                    pulp2mutatedcontent.append(outdated.pulp2_id)
                    outdated.delete()

            downloaded = record.downloaded if hasattr(record,
                                                      'downloaded') else False
            item = Pulp2Content(pulp2_id=record.id,
                                pulp2_content_type_id=record._content_type_id,
                                pulp2_last_updated=record._last_updated,
                                pulp2_storage_path=record._storage_path,
                                downloaded=downloaded)
            _logger.debug(
                'Add content item to the list to migrate: {item}'.format(
                    item=item))
            pulp2content.append(item)

        # determine if the batch needs to be saved, also take into account whether there is
        # anything in the pulp2content to be saved
        save_batch = pulp2content and (i and not (i + 1) % batch_size
                                       or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.
                format(index=i + 1))
            pulp2content_batch = Pulp2Content.objects.bulk_create(
                pulp2content, ignore_conflicts=True)
            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            await content_model.pulp_2to3_detail.pre_migrate_content_detail(
                pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            pulp2content = []
            existing_count = 0
    if pulp2mutatedcontent:
        # When we flip the is_migrated flag to False, we normally base that decision on
        # last_unit_added, see
        # https://github.com/pulp/pulp-2to3-migration/blob/master/pulp_2to3_migration/app/pre_migration.py#L279  # noqa
        # Here the flag still needs to be updated manually because of errata: in Pulp 2, sync and
        # copy of updated errata do not bump last_unit_added; it is only updated on all the repos
        # that contain an erratum when the erratum is uploaded.
        mutated_content = Pulp2RepoContent.objects.filter(
            pulp2_unit_id__in=pulp2mutatedcontent)
        repo_to_update_ids = set(
            mutated_content.values_list('pulp2_repository_id', flat=True))
        repos_to_update = []
        for pulp2repo in Pulp2Repository.objects.filter(
                pk__in=repo_to_update_ids):
            pulp2repo.is_migrated = False
            repos_to_update.append(pulp2repo)

        Pulp2Repository.objects.bulk_update(objs=repos_to_update,
                                            fields=['is_migrated'],
                                            batch_size=1000)

    await pre_migrate_lazycatalog(content_type)

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
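The loop above accumulates Pulp2Content records and flushes them with bulk_create whenever a full batch is reached or the last record has been processed, provided something is pending. A plain-Python sketch of that save_batch condition, with a list of batches standing in for the Django bulk_create call:

def migrate_in_batches(records, batch_size=3):
    """Accumulate records and flush them in batches, mirroring the save_batch condition above."""
    pending = []
    flushed_batches = []
    total = len(records)
    for i, record in enumerate(records):
        pending.append(record)
        # flush on every full batch, and once more for the trailing partial batch
        save_batch = pending and (i and not (i + 1) % batch_size or i == total - 1)
        if save_batch:
            flushed_batches.append(list(pending))  # stand-in for Pulp2Content.objects.bulk_create
            pending = []
    return flushed_batches

print(migrate_in_batches(list(range(7))))  # [[0, 1, 2], [3, 4, 5], [6]]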
Example #4
class CollectionSyncFirstStage(Stage):
    """
    The first stage of a pulp_ansible sync pipeline.
    """
    def __init__(self, remote, repository, optimize):
        """
        The first stage of a pulp_ansible sync pipeline.

        Args:
            remote (CollectionRemote): The remote data to be used when syncing
            repository (AnsibleRepository): The repository being synced.
            optimize (boolean): Whether to optimize sync or not.

        """
        super().__init__()
        msg = _("Parsing CollectionVersion Metadata")
        self.parsing_metadata_progress_bar = ProgressReport(
            message=msg, code="parsing.metadata")
        self.remote = remote
        self.repository = repository
        self.optimize = optimize
        self.collection_info = parse_collections_requirements_file(
            remote.requirements_file)
        self.deprecations = Q()
        self.add_dependents = self.collection_info and self.remote.sync_dependencies
        self.already_synced = set()
        self._unpaginated_collection_metadata = None
        self._unpaginated_collection_version_metadata = None
        self.last_synced_metadata_time = None

        # Interpret download policy
        self.deferred_download = self.remote.policy != Remote.IMMEDIATE

    @alru_cache(maxsize=128)
    async def _get_root_api(self, root):
        """
        Returns the root api path and api version.

        Based on https://git.io/JTMxE.
        """
        if root == "https://galaxy.ansible.com" or root == "https://galaxy.ansible.com/":
            root = "https://galaxy.ansible.com/api/"

        downloader = self.remote.get_downloader(url=root)

        try:
            api_data = parse_metadata(await downloader.run())
        except (json.decoder.JSONDecodeError, ClientResponseError):
            if root.endswith("/api/"):
                raise

            root = urljoin(root, "api/")
            downloader = self.remote.get_downloader(url=root)
            api_data = parse_metadata(await downloader.run())

        if "available_versions" not in api_data:
            raise RuntimeError(
                _("Could not find 'available_versions' at {}").format(root))

        if "v3" in api_data.get("available_versions", {}):
            api_version = 3
        elif "v2" in api_data.get("available_versions", {}):
            api_version = 2
        else:
            raise RuntimeError(
                _("Unsupported API versions at {}").format(root))

        endpoint = f"{root}v{api_version}"

        return endpoint, api_version

    @alru_cache(maxsize=128)
    async def _get_paginated_collection_api(self, root):
        """
        Returns the collection api path and api version.

        Based on https://git.io/JTMxE.
        """
        endpoint, api_version = await self._get_root_api(root)
        return f"{endpoint}/collections/", api_version

    async def _fetch_collection_version_metadata(self, api_version,
                                                 collection_version_url):
        downloader = self.remote.get_downloader(url=collection_version_url)
        metadata = parse_metadata(await downloader.run())
        await self._add_collection_version(api_version, collection_version_url,
                                           metadata)

    async def _add_collection_version(self, api_version,
                                      collection_version_url, metadata):
        """Add CollectionVersion to the sync pipeline."""
        url = metadata["download_url"]
        collection_version = CollectionVersion(
            namespace=metadata["namespace"]["name"],
            name=metadata["collection"]["name"],
            version=metadata["version"],
        )
        cv_unique = attrgetter("namespace", "name",
                               "version")(collection_version)
        if cv_unique in self.already_synced:
            return
        self.already_synced.add(cv_unique)

        info = metadata["metadata"]

        if self.add_dependents:
            dependencies = info["dependencies"]
            tasks = []
            loop = asyncio.get_event_loop()
            for full_name, version in dependencies.items():
                namespace, name = full_name.split(".")
                if not (namespace, name, version) in self.already_synced:
                    new_req = RequirementsFileEntry(
                        name=full_name,
                        version=version,
                        source=None,
                    )
                    tasks.append(
                        loop.create_task(
                            self._fetch_collection_metadata(new_req)))
            await asyncio.gather(*tasks)

        info.pop("tags")
        for attr_name, attr_value in info.items():
            if attr_value is None or attr_name not in collection_version.__dict__:
                continue
            setattr(collection_version, attr_name, attr_value)

        artifact = metadata["artifact"]

        d_artifact = DeclarativeArtifact(
            artifact=Artifact(sha256=artifact["sha256"],
                              size=artifact["size"]),
            url=url,
            relative_path=collection_version.relative_path,
            remote=self.remote,
            deferred_download=self.deferred_download,
        )

        extra_data = {}
        if api_version != 2:  # V2 never implemented the docs-blob requests
            extra_data["docs_blob_url"] = f"{collection_version_url}docs-blob/"

        d_content = DeclarativeContent(
            content=collection_version,
            d_artifacts=[d_artifact],
            extra_data=extra_data,
        )
        self.parsing_metadata_progress_bar.increment()
        await self.put(d_content)

    def _collection_versions_list_downloader(self, api_version,
                                             collection_endpoint, namespace,
                                             name, page_num, page_size):
        url_without_get_params = f"{collection_endpoint}{namespace}/{name}/versions/"
        if api_version == 2:
            versions_list_url = f"{url_without_get_params}?page={page_num}&page_size={page_size}"
        else:
            offset = (page_num - 1) * page_size
            versions_list_url = f"{url_without_get_params}?limit={page_size}&offset={offset}"
        return self.remote.get_downloader(url=versions_list_url)

    async def _fetch_paginated_collection_metadata(self,
                                                   name,
                                                   namespace,
                                                   requirement,
                                                   source=None):
        root = source or self.remote.url
        collection_endpoint, api_version = await self._get_paginated_collection_api(
            root)
        collection_url = f"{collection_endpoint}{namespace}/{name}"
        collection_metadata_downloader = self.remote.get_downloader(
            url=collection_url)
        collection_metadata = parse_metadata(
            await collection_metadata_downloader.run())
        loop = asyncio.get_event_loop()

        tasks = []
        page_num = 1
        while True:
            versions_list_downloader = self._collection_versions_list_downloader(
                api_version, collection_endpoint, namespace, name, page_num,
                PAGE_SIZE)
            collection_versions_list = parse_metadata(
                await versions_list_downloader.run())
            if api_version == 2:
                collection_versions = collection_versions_list["results"]
            else:
                collection_versions = collection_versions_list["data"]
            for collection_version in collection_versions:
                if collection_version["version"] in requirement:
                    version_num = collection_version["version"]
                    collection_version_detail_url = f"{collection_url}/versions/{version_num}/"
                    if collection_metadata["deprecated"]:
                        self.deprecations |= Q(namespace=namespace, name=name)
                    tasks.append(
                        loop.create_task(
                            self._fetch_collection_version_metadata(
                                api_version,
                                collection_version_detail_url,
                            )))
            next_value = self._get_response_next_value(
                api_version, collection_versions_list)
            if not next_value:
                break
            page_num = page_num + 1

        await asyncio.gather(*tasks)

    async def _read_from_downloaded_metadata(self, name, namespace,
                                             requirement):
        tasks = []
        loop = asyncio.get_event_loop()

        if self._unpaginated_collection_metadata[namespace][name][
                "deprecated"]:
            self.deprecations |= Q(namespace=namespace, name=name)

        all_versions_of_collection = self._unpaginated_collection_version_metadata[
            namespace][name]

        for col_version_metadata in all_versions_of_collection:
            if col_version_metadata["version"] in requirement:
                collection_version_url = urljoin(
                    self.remote.url, f"{col_version_metadata['href']}")
                tasks.append(
                    loop.create_task(
                        self._add_collection_version(self._api_version,
                                                     collection_version_url,
                                                     col_version_metadata)))
        await asyncio.gather(*tasks)

    async def _fetch_collection_metadata(self, requirements_entry):
        if requirements_entry.version == "*":
            requirement_version = Requirement.parse("collection")
        else:
            requirement_version = Requirement.parse(
                f"collection{requirements_entry.version}")

        namespace, name = requirements_entry.name.split(".")

        if self._unpaginated_collection_version_metadata and requirements_entry.source is None:
            await self._read_from_downloaded_metadata(name, namespace,
                                                      requirement_version)
        else:
            await self._fetch_paginated_collection_metadata(
                name, namespace, requirement_version,
                requirements_entry.source)

    @staticmethod
    def _get_response_next_value(api_version, response):
        if api_version == 2:
            return response["next"]
        else:
            return response["links"]["next"]

    def _collection_list_downloader(self, api_version, collection_endpoint,
                                    page_num, page_size):
        if api_version == 2:
            collection_list_url = f"{collection_endpoint}?page={page_num}&page_size={page_size}"
        else:
            offset = (page_num - 1) * page_size
            collection_list_url = f"{collection_endpoint}?limit={page_size}&offset={offset}"
        return self.remote.get_downloader(url=collection_list_url)

    async def _download_unpaginated_metadata(self):
        root_endpoint, api_version = await self._get_root_api(self.remote.url)
        self._api_version = api_version
        if api_version > 2:
            collection_endpoint = f"{root_endpoint}/collections/all/"
            downloader = self.remote.get_downloader(
                url=collection_endpoint,
                silence_errors_for_response_status_codes={404})
            try:
                collection_metadata_list = parse_metadata(await
                                                          downloader.run())
            except FileNotFoundError:
                pass
            else:
                self._unpaginated_collection_metadata = defaultdict(dict)
                for collection in collection_metadata_list:
                    namespace = collection["namespace"]
                    name = collection["name"]
                    self._unpaginated_collection_metadata[namespace][
                        name] = collection

                collection_version_endpoint = f"{root_endpoint}/collection_versions/all/"
                downloader = self.remote.get_downloader(
                    url=collection_version_endpoint)
                collection_version_metadata_list = parse_metadata(
                    await downloader.run())

                self._unpaginated_collection_version_metadata = defaultdict(
                    lambda: defaultdict(list))
                for collection_version_metadata in collection_version_metadata_list:
                    namespace = collection_version_metadata["namespace"][
                        "name"]
                    name = collection_version_metadata["name"]
                    self._unpaginated_collection_version_metadata[namespace][
                        name].append(collection_version_metadata)

    async def _find_all_collections_from_unpaginated_data(self):
        tasks = []
        loop = asyncio.get_event_loop()

        for collection_namespace_dict in self._unpaginated_collection_metadata.values(
        ):
            for collection in collection_namespace_dict.values():
                if collection["deprecated"]:
                    self.deprecations |= Q(namespace=collection["namespace"],
                                           name=collection["name"])

        for collections_in_namespace in self._unpaginated_collection_version_metadata.values(
        ):
            for collection_versions in collections_in_namespace.values():
                for collection_version in collection_versions:
                    collection_version_url = urljoin(
                        self.remote.url, f"{collection_version['href']}")
                    tasks.append(
                        loop.create_task(
                            self._add_collection_version(
                                self._api_version, collection_version_url,
                                collection_version)))

        await asyncio.gather(*tasks)

    async def _find_all_collections(self):
        if self._unpaginated_collection_version_metadata:
            await self._find_all_collections_from_unpaginated_data()
            return

        collection_endpoint, api_version = await self._get_paginated_collection_api(
            self.remote.url)
        loop = asyncio.get_event_loop()

        tasks = []
        page_num = 1
        while True:
            collection_list_downloader = self._collection_list_downloader(
                api_version, collection_endpoint, page_num, PAGE_SIZE)
            collection_list = parse_metadata(await
                                             collection_list_downloader.run())

            if api_version == 2:
                collections = collection_list["results"]
            else:
                collections = collection_list["data"]

            for collection in collections:
                if api_version == 2:
                    namespace = collection["namespace"]["name"]
                else:
                    namespace = collection["namespace"]
                name = collection["name"]
                requirements_file = RequirementsFileEntry(
                    name=".".join([namespace, name]),
                    version="*",
                    source=None,
                )
                tasks.append(
                    loop.create_task(
                        self._fetch_collection_metadata(requirements_file)))

            next_value = self._get_response_next_value(api_version,
                                                       collection_list)
            if not next_value:
                break
            page_num = page_num + 1

        await asyncio.gather(*tasks)

    async def _should_we_sync(self):
        """Check last synced metadata time."""
        msg = _("no-op: Checking if remote changed since last sync.")
        noop = ProgressReport(message=msg, code="noop")
        noop.state = TASK_STATES.COMPLETED
        noop.save()

        if not self.repository.remote:
            return True

        if self.remote != self.repository.remote.cast():
            return True

        root, api_version = await self._get_root_api(self.remote.url)
        if api_version == 3:
            downloader = self.remote.get_downloader(
                url=root, silence_errors_for_response_status_codes={404})
            try:
                metadata = parse_metadata(await downloader.run())
            except FileNotFoundError:
                return True

            try:
                self.last_synced_metadata_time = parse_datetime(
                    metadata["published"])
            except KeyError:
                return True

            sources = set()
            if self.collection_info:
                sources = {r.source for r in self.collection_info if r.source}
            sources.add(self.remote.url)
            if len(sources) > 1:
                return True

            if self.last_synced_metadata_time == self.repository.last_synced_metadata_time:
                noop.message = _(
                    "no-op: {remote} did not change since last sync - {published}"
                    .format(remote=self.remote.url,
                            published=self.last_synced_metadata_time))
                noop.save()
                return False

        return True

    async def run(self):
        """
        Build and emit `DeclarativeContent` from the ansible metadata.
        """
        if self.optimize:
            should_we_sync = await self._should_we_sync()
            if should_we_sync is False:
                log.debug(_("no-op: remote wasn't updated since last sync."))
                return

        tasks = []
        loop = asyncio.get_event_loop()

        await self._download_unpaginated_metadata()

        if self.collection_info:
            for requirement_entry in self.collection_info:
                tasks.append(
                    loop.create_task(
                        self._fetch_collection_metadata(requirement_entry)))
        else:
            tasks.append(loop.create_task(self._find_all_collections()))
        await asyncio.gather(*tasks)
        self.parsing_metadata_progress_bar.state = TASK_STATES.COMPLETED
        self.parsing_metadata_progress_bar.save()
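The stage builds paginated list URLs differently for the Galaxy v2 API (page/page_size) and the v3 API (limit/offset), as in _collection_versions_list_downloader() and _collection_list_downloader(). A stand-alone sketch of that URL construction; the endpoint below is a placeholder, not a real Galaxy instance:

def versions_list_url(api_version, collection_endpoint, namespace, name, page_num, page_size):
    """Build a collection versions-list URL, mirroring the v2/v3 pagination split above."""
    base = f"{collection_endpoint}{namespace}/{name}/versions/"
    if api_version == 2:
        # v2 paginates with page numbers
        return f"{base}?page={page_num}&page_size={page_size}"
    # v3 paginates with limit/offset
    offset = (page_num - 1) * page_size
    return f"{base}?limit={page_size}&offset={offset}"

endpoint = "https://galaxy.example.com/api/v3/collections/"  # placeholder endpoint
print(versions_list_url(3, endpoint, "community", "general", page_num=2, page_size=100))
# https://galaxy.example.com/api/v3/collections/community/general/versions/?limit=100&offset=100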
Example #5
    async def run(self):
        """
        Build `DeclarativeContent` from the repodata.
        """
        packages_pb = ProgressReport(message='Parsed Packages',
                                     code='parsing.packages')
        errata_pb = ProgressReport(message='Parsed Erratum',
                                   code='parsing.errata')
        modulemd_pb = ProgressReport(message='Parse Modulemd',
                                     code='parsing.modulemds')
        modulemd_defaults_pb = ProgressReport(
            message='Parse Modulemd-defaults', code='parsing.modulemddefaults')
        comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

        packages_pb.save()
        errata_pb.save()
        comps_pb.save()

        remote_url = self.new_url or self.remote.url
        remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

        progress_data = dict(message='Downloading Metadata Files',
                             code='downloading.metadata')
        with ProgressReport(**progress_data) as metadata_pb:
            downloader = self.remote.get_downloader(
                url=urljoin(remote_url, 'repodata/repomd.xml'))
            # TODO: decide how to distinguish between a mirror list and a normal repo
            result = await downloader.run()
            metadata_pb.increment()

            if self.kickstart:
                d_artifacts = []
                for path, checksum in self.kickstart["download"][
                        "images"].items():
                    artifact = Artifact(**checksum)

                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=urljoin(remote_url, path),
                        relative_path=path,
                        remote=self.remote,
                        deferred_download=self.deferred_download)

                    d_artifacts.append(da)

                distribution_tree = DistributionTree(
                    **self.kickstart["distribution_tree"])
                dc = DeclarativeContent(content=distribution_tree,
                                        d_artifacts=d_artifacts)
                dc.extra_data = self.kickstart
                await self.put(dc)

            repomd_path = result.path
            repomd = cr.Repomd(repomd_path)
            package_repodata_urls = {}
            downloaders = []
            modulemd_list = list()
            dc_groups = []
            dc_categories = []
            dc_environments = []
            nevra_to_module = defaultdict(dict)
            pkgname_to_groups = defaultdict(list)
            group_to_categories = defaultdict(list)
            group_to_environments = defaultdict(list)
            optionalgroup_to_environments = defaultdict(list)
            modulemd_results = None
            comps_downloader = None

            for record in repomd.records:
                if record.type in PACKAGE_REPODATA:
                    package_repodata_urls[record.type] = urljoin(
                        remote_url, record.location_href)
                elif record.type in UPDATE_REPODATA:
                    updateinfo_url = urljoin(remote_url, record.location_href)
                    downloader = self.remote.get_downloader(url=updateinfo_url)
                    downloaders.append([downloader.run()])

                elif record.type in COMPS_REPODATA:
                    comps_url = urljoin(remote_url, record.location_href)
                    comps_downloader = self.remote.get_downloader(
                        url=comps_url)

                elif record.type in SKIP_REPODATA:
                    continue

                elif record.type in MODULAR_REPODATA:
                    modules_url = urljoin(remote_url, record.location_href)
                    modulemd_downloader = self.remote.get_downloader(
                        url=modules_url)
                    modulemd_results = await modulemd_downloader.run()

                elif record.type not in PACKAGE_DB_REPODATA:
                    file_data = {
                        record.checksum_type: record.checksum,
                        "size": record.size
                    }
                    da = DeclarativeArtifact(
                        artifact=Artifact(**file_data),
                        url=urljoin(remote_url, record.location_href),
                        relative_path=record.location_href,
                        remote=self.remote,
                        deferred_download=False)
                    repo_metadata_file = RepoMetadataFile(
                        data_type=record.type,
                        checksum_type=record.checksum_type,
                        checksum=record.checksum,
                    )
                    dc = DeclarativeContent(content=repo_metadata_file,
                                            d_artifacts=[da])
                    await self.put(dc)

            # we have to sync module.yaml first if it exists, to make relations to packages
            if modulemd_results:
                modulemd_index = mmdlib.ModuleIndex.new()
                open_func = gzip.open if modulemd_results.url.endswith(
                    '.gz') else open
                with open_func(modulemd_results.path, 'r') as moduleyaml:
                    modulemd_index.update_from_string(
                        moduleyaml.read().decode(), True)

                modulemd_names = modulemd_index.get_module_names() or []
                modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

                modulemd_pb.total = len(modulemd_all)
                modulemd_pb.state = 'running'
                modulemd_pb.save()

                for modulemd in modulemd_all:
                    artifact = modulemd.pop('artifact')
                    relative_path = '{}{}{}{}{}snippet'.format(
                        modulemd[PULP_MODULE_ATTR.NAME],
                        modulemd[PULP_MODULE_ATTR.STREAM],
                        modulemd[PULP_MODULE_ATTR.VERSION],
                        modulemd[PULP_MODULE_ATTR.CONTEXT],
                        modulemd[PULP_MODULE_ATTR.ARCH])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    modulemd_content = Modulemd(**modulemd)
                    dc = DeclarativeContent(content=modulemd_content,
                                            d_artifacts=[da])
                    dc.extra_data = defaultdict(list)

                    # dc.content.artifacts are Modulemd artifacts
                    for artifact in json.loads(dc.content.artifacts):
                        nevra_to_module.setdefault(artifact, set()).add(dc)
                    modulemd_list.append(dc)

                modulemd_default_names = parse_defaults(modulemd_index)

                modulemd_defaults_pb.total = len(modulemd_default_names)
                modulemd_defaults_pb.state = 'running'
                modulemd_defaults_pb.save()

                for default in modulemd_default_names:
                    artifact = default.pop('artifact')
                    relative_path = '{}{}snippet'.format(
                        default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                        default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    default_content = ModulemdDefaults(**default)
                    modulemd_defaults_pb.increment()
                    dc = DeclarativeContent(content=default_content,
                                            d_artifacts=[da])
                    await self.put(dc)

            if comps_downloader:
                comps_result = await comps_downloader.run()

                comps = libcomps.Comps()
                comps.fromxml_f(comps_result.path)

                comps_pb.total = (len(comps.groups) + len(comps.categories) +
                                  len(comps.environments))
                comps_pb.state = 'running'
                comps_pb.save()

                if comps.langpacks:
                    langpack_dict = PackageLangpacks.libcomps_to_dict(
                        comps.langpacks)
                    packagelangpack = PackageLangpacks(
                        matches=strdict_to_dict(comps.langpacks),
                        digest=dict_digest(langpack_dict))
                    dc = DeclarativeContent(content=packagelangpack)
                    dc.extra_data = defaultdict(list)
                    await self.put(dc)

                if comps.categories:
                    for category in comps.categories:
                        category_dict = PackageCategory.libcomps_to_dict(
                            category)
                        category_dict['digest'] = dict_digest(category_dict)
                        packagecategory = PackageCategory(**category_dict)
                        dc = DeclarativeContent(content=packagecategory)
                        dc.extra_data = defaultdict(list)

                        if packagecategory.group_ids:
                            for group_id in packagecategory.group_ids:
                                group_to_categories[group_id['name']].append(
                                    dc)
                        dc_categories.append(dc)

                if comps.environments:
                    for environment in comps.environments:
                        environment_dict = PackageEnvironment.libcomps_to_dict(
                            environment)
                        environment_dict['digest'] = dict_digest(
                            environment_dict)
                        packageenvironment = PackageEnvironment(
                            **environment_dict)
                        dc = DeclarativeContent(content=packageenvironment)
                        dc.extra_data = defaultdict(list)

                        if packageenvironment.option_ids:
                            for option_id in packageenvironment.option_ids:
                                optionalgroup_to_environments[
                                    option_id['name']].append(dc)

                        if packageenvironment.group_ids:
                            for group_id in packageenvironment.group_ids:
                                group_to_environments[group_id['name']].append(
                                    dc)

                        dc_environments.append(dc)

                if comps.groups:
                    for group in comps.groups:
                        group_dict = PackageGroup.libcomps_to_dict(group)
                        group_dict['digest'] = dict_digest(group_dict)
                        packagegroup = PackageGroup(**group_dict)
                        dc = DeclarativeContent(content=packagegroup)
                        dc.extra_data = defaultdict(list)

                        if packagegroup.packages:
                            for package in packagegroup.packages:
                                pkgname_to_groups[package['name']].append(dc)

                        if dc.content.id in group_to_categories.keys():
                            for dc_category in group_to_categories[
                                    dc.content.id]:
                                dc.extra_data['category_relations'].append(
                                    dc_category)
                                dc_category.extra_data['packagegroups'].append(
                                    dc)

                        if dc.content.id in group_to_environments.keys():
                            for dc_environment in group_to_environments[
                                    dc.content.id]:
                                dc.extra_data['environment_relations'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'packagegroups'].append(dc)

                        if dc.content.id in optionalgroup_to_environments.keys(
                        ):
                            for dc_environment in optionalgroup_to_environments[
                                    dc.content.id]:
                                dc.extra_data['env_relations_optional'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'optionalgroups'].append(dc)

                        dc_groups.append(dc)

                for dc_category in dc_categories:
                    comps_pb.increment()
                    await self.put(dc_category)

                for dc_environment in dc_environments:
                    comps_pb.increment()
                    await self.put(dc_environment)

            # to preserve order, downloaders are created after all repodata urls are identified
            package_repodata_downloaders = []
            for repodata_type in PACKAGE_REPODATA:
                downloader = self.remote.get_downloader(
                    url=package_repodata_urls[repodata_type])
                package_repodata_downloaders.append(downloader.run())

            downloaders.append(package_repodata_downloaders)

            # asyncio.gather is used to preserve the order of results for package repodata
            pending = [
                asyncio.gather(*downloaders_group)
                for downloaders_group in downloaders
            ]

            while pending:
                done, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED)
                for downloader in done:
                    results = downloader.result()
                    if results[0].url == package_repodata_urls['primary']:
                        primary_xml_path = results[0].path
                        filelists_xml_path = results[1].path
                        other_xml_path = results[2].path
                        metadata_pb.done += 3
                        metadata_pb.save()

                        packages = await RpmFirstStage.parse_repodata(
                            primary_xml_path, filelists_xml_path,
                            other_xml_path)
                        packages_pb.total = len(packages)
                        packages_pb.state = 'running'
                        packages_pb.save()

                        for pkg in packages.values():
                            package = Package(
                                **Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
                            checksum_type = getattr(
                                CHECKSUM_TYPES, package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(remote_url, package.location_href)
                            filename = os.path.basename(package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package,
                                                    d_artifacts=[da])
                            dc.extra_data = defaultdict(list)

                            # find if a package relates to a modulemd
                            if dc.content.nevra in nevra_to_module.keys():
                                dc.content.is_modular = True
                                for dc_modulemd in nevra_to_module[
                                        dc.content.nevra]:
                                    dc.extra_data['modulemd_relation'].append(
                                        dc_modulemd)
                                    dc_modulemd.extra_data[
                                        'package_relation'].append(dc)

                            if dc.content.name in pkgname_to_groups.keys():
                                for dc_group in pkgname_to_groups[
                                        dc.content.name]:
                                    dc.extra_data['group_relations'].append(
                                        dc_group)
                                    dc_group.extra_data[
                                        'related_packages'].append(dc)

                            packages_pb.increment()
                            await self.put(dc)

                    elif results[0].url == updateinfo_url:
                        updateinfo_xml_path = results[0].path
                        metadata_pb.increment()

                        updates = await RpmFirstStage.parse_updateinfo(
                            updateinfo_xml_path)

                        errata_pb.total = len(updates)
                        errata_pb.state = 'running'
                        errata_pb.save()

                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = RpmFirstStage.hash_update_record(
                                update)
                            future_relations = {
                                'collections': defaultdict(list),
                                'references': []
                            }

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(
                                    collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(**pkg_dict)
                                    future_relations['collections'][
                                        coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(
                                    reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            errata_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

            # now send modules down the pipeline since all relations have been set up
            for modulemd in modulemd_list:
                modulemd_pb.increment()
                await self.put(modulemd)

            for dc_group in dc_groups:
                comps_pb.increment()
                await self.put(dc_group)

        packages_pb.state = 'completed'
        errata_pb.state = 'completed'
        modulemd_pb.state = 'completed'
        modulemd_defaults_pb.state = 'completed'
        comps_pb.state = 'completed'
        packages_pb.save()
        errata_pb.save()
        modulemd_pb.save()
        modulemd_defaults_pb.save()
        comps_pb.save()
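The RPM stage wraps each group of repodata downloads in asyncio.gather (so results within a group keep their order, e.g. primary/filelists/other) and then consumes whole groups as they finish with asyncio.wait(..., return_when=FIRST_COMPLETED). A self-contained sketch of that pattern with dummy coroutines in place of the downloaders:

import asyncio

async def fake_download(name, delay):
    # stand-in for a downloader.run() coroutine
    await asyncio.sleep(delay)
    return name

async def main():
    groups = [
        [fake_download("primary.xml", 0.02), fake_download("filelists.xml", 0.01)],
        [fake_download("updateinfo.xml", 0.005)],
    ]
    # gather preserves order inside a group; wait() lets us handle groups as they complete
    pending = [asyncio.ensure_future(asyncio.gather(*group)) for group in groups]
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for finished in done:
            print(finished.result())

asyncio.run(main())
# ['updateinfo.xml']
# ['primary.xml', 'filelists.xml']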
Example #6
async def pre_migrate_content(content_model):
    """
    A coroutine to pre-migrate Pulp 2 content.

    Args:
        content_model: Models for content which is being migrated.
    """
    batch_size = 10000
    content_type = content_model.pulp2.type
    pulp2content = []

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(
        pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(
        Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug(
        'The latest migrated {type} content has {timestamp} timestamp.'.format(
            type=content_type, timestamp=last_updated))

    # query only newly created/updated items
    mongo_content_qs = content_model.pulp2.objects(
        _last_updated__gte=last_updated)
    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type, total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(
            content_type.upper()),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(
            content_type.upper()),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    existing_count = 0
    fields = set(['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        fields.add('downloaded')
    for i, record in enumerate(
            mongo_content_qs.only(*fields).batch_size(batch_size)):
        if record._last_updated == last_updated:
            # corner case - content with the latest ``last_updated`` date might already be
            # pre-migrated; check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(
                pulp2_last_updated=last_updated, pulp2_id=record.id)
            if migrated:
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2content_pb.save()
                pulp2detail_pb.total -= 1
                pulp2detail_pb.save()
                continue

        downloaded = record.downloaded if hasattr(record,
                                                  'downloaded') else False
        item = Pulp2Content(pulp2_id=record.id,
                            pulp2_content_type_id=record._content_type_id,
                            pulp2_last_updated=record._last_updated,
                            pulp2_storage_path=record._storage_path,
                            downloaded=downloaded)
        _logger.debug('Add content item to the list to migrate: {item}'.format(
            item=item))
        pulp2content.append(item)

        save_batch = (i and not (i + 1) % batch_size or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.
                format(index=i + 1))
            pulp2content_batch = Pulp2Content.objects.bulk_create(
                pulp2content, ignore_conflicts=True)
            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            await content_model.pulp_2to3_detail.pre_migrate_content_detail(
                pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            pulp2content = []
            existing_count = 0

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
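Because the query filters on `_last_updated__gte`, records that share the boundary timestamp can be returned again on a later run; the corner case above re-checks those against Pulp2Content before migrating. A tiny pure-Python sketch of that incremental selection, with tuples and a set standing in for the Mongo and Django querysets:

def select_records_to_premigrate(records, last_updated, already_migrated_ids):
    """Pick records updated at or after last_updated, skipping boundary records already migrated."""
    to_migrate = []
    for record_id, updated in records:
        if updated < last_updated:
            continue  # the >= filter in the real query would not return these at all
        if updated == last_updated and record_id in already_migrated_ids:
            continue  # corner case: boundary timestamp, but already pre-migrated
        to_migrate.append(record_id)
    return to_migrate

records = [("a", 1), ("b", 2), ("c", 2), ("d", 3)]
print(select_records_to_premigrate(records, last_updated=2, already_migrated_ids={"b"}))  # ['c', 'd']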
Example #7
def pre_migrate_content_type(content_model, mutable_type, lazy_type, premigrate_hook):
    """
    A coroutine to pre-migrate Pulp 2 content, including all details for on_demand content.

    Args:
        content_model: Models for content which is being migrated.
        mutable_type: Boolean that indicates whether the content type is mutable.
    """
    def delete_removed_pulp2_content(content_model):
        """
        Delete Pulp2Content records for content which is no longer present in Pulp2.

        This is to avoid situations and extra work when not all content migrated during the first
        migration run, then orphan clean up is run in Pulp 2, and then migration is run again.

        Args:
            content_model: Pulp 2 content model

        """
        content_type = content_model.pulp2.TYPE_ID
        mongo_content_qs = content_model.pulp2.objects().only('id')
        mongo_content_ids = {c['_id'] for c in mongo_content_qs.as_pymongo().no_cache()}
        premigrated_content_ids = set(
            Pulp2Content.objects.filter(
                pulp2_content_type_id=content_type
            ).only('pulp2_id').values_list('pulp2_id', flat=True)
        )
        content_ids_to_delete = premigrated_content_ids - mongo_content_ids
        if content_ids_to_delete:
            Pulp2Content.objects.filter(
                pulp2_content_type_id=content_type,
                pulp2_id__in=content_ids_to_delete
            ).delete()

    batch_size = settings.CONTENT_PREMIGRATION_BATCH_SIZE or DEFAULT_BATCH_SIZE
    pulp2content = []
    pulp2mutatedcontent = []
    content_type = content_model.pulp2.TYPE_ID
    set_pulp2_repo = content_model.pulp_2to3_detail.set_pulp2_repo

    delete_removed_pulp2_content(content_model)

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug('The latest migrated {type} content has {timestamp} timestamp.'.format(
        type=content_type,
        timestamp=last_updated))

    query_args = {}
    if premigrate_hook:
        pulp2_content_ids = premigrate_hook()
        query_args["id__in"] = pulp2_content_ids

    mongo_content_qs = content_model.pulp2.objects(
        _last_updated__gte=last_updated, **query_args
    ).order_by("_last_updated")

    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type,
        total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(content_type),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(content_type),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    existing_count = 0

    if mutable_type:
        pulp2_content_ids = []

        for c in mongo_content_qs.only('id', '_last_updated').no_cache().as_pymongo():
            if c['_last_updated'] == last_updated:
                if Pulp2Content.objects.filter(
                        pulp2_last_updated=last_updated, pulp2_id=c['_id']).exists():
                    continue

                pulp2_content_ids.append(c['_id'])

        # This is a mutable content type. Query for the existing pulp2content records:
        # if any are found, the migrated content is older than the incoming content.
        # Delete the outdated migrated pulp2content so new records are created below.
        outdated = Pulp2Content.objects.filter(pulp2_id__in=pulp2_content_ids)
        if outdated.exists():
            pulp2mutatedcontent.extend(pulp2_content_ids)
        outdated.delete()

    mongo_fields = set(['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        mongo_fields.add('downloaded')

    batched_mongo_content_qs = mongo_content_qs.only(*mongo_fields).batch_size(batch_size)
    for i, record in enumerate(batched_mongo_content_qs.no_cache()):
        if record._last_updated == last_updated:
            # corner case - content with the latest ``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(pulp2_last_updated=last_updated,
                                                   pulp2_id=record.id)
            if migrated.exists():
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2detail_pb.total -= 1
                continue

        downloaded = record.downloaded if hasattr(record, 'downloaded') else False

        if set_pulp2_repo:
            # This content type requires the Pulp 2 repo to be set, e.g. for errata, because
            # one Pulp 2 content unit is converted into N Pulp 3 content units and repo_id is
            # the only way to keep those records unique.
            content_relations = Pulp2RepoContent.objects.filter(
                pulp2_unit_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                pulp2_repository__not_in_plan=False,
            ).select_related(
                'pulp2_repository'
            ).only(
                'pulp2_repository'
            )
            for relation in content_relations.iterator():
                item = Pulp2Content(
                    pulp2_id=record.id,
                    pulp2_content_type_id=record._content_type_id,
                    pulp2_last_updated=record._last_updated,
                    pulp2_storage_path=record._storage_path,
                    downloaded=downloaded,
                    pulp2_repo=relation.pulp2_repository,
                )
                _logger.debug(
                    'Add content item to the list to migrate: {item}'.format(item=item))
                pulp2content.append(item)
                pulp2content_pb.total += 1
                pulp2detail_pb.total += 1

            # The totals need to be adjusted: proper counting happened in the loop above, so
            # subtract one because this record was already counted in the initial 'total'.
            pulp2content_pb.total -= 1
            pulp2detail_pb.total -= 1
        else:
            item = Pulp2Content(
                pulp2_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                pulp2_last_updated=record._last_updated,
                pulp2_storage_path=record._storage_path,
                downloaded=downloaded
            )
            _logger.debug('Add content item to the list to migrate: {item}'.format(item=item))
            pulp2content.append(item)

        # determine if the batch needs to be saved, also take into account whether there is
        # anything in the pulp2content to be saved
        save_batch = pulp2content and (len(pulp2content) >= batch_size or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.format(index=i + 1)
            )
            pulp2content_batch = Pulp2Content.objects.bulk_create(pulp2content,
                                                                  ignore_conflicts=True)

            # bulk_create(ignore_conflicts=True) hands back the same item-set we passed in,
            # *even if* it skipped inserting a row because a matching db-record already
            # exists. As a result, we can't trust pulp2content_batch to have the 'right'
            # PKs (i.e., the in-memory pulp2content_batch doesn't match the db-reality).
            # This causes the pre_migrate_content_detail() below to fail as it attempts to
            # create detail-records for the Pulp2Content records it's been handed.
            # THEREFORE - we need to look up the 'real' PKs of everything in
            # pulp2content_batch based on its uniqueness-fields and update the in-memory
            # list with them.
            for p2c in pulp2content_batch:
                filter_q = Q(
                    pulp2_content_type_id=content_type,
                    pulp2_id=p2c.pulp2_id,
                    pulp2_repo=p2c.pulp2_repo,
                    pulp2_subid=p2c.pulp2_subid,
                )
                p2c_db = Pulp2Content.objects.get(filter_q)
                p2c.pulp_id = p2c_db.pulp_id

            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            content_model.pulp_2to3_detail.pre_migrate_content_detail(pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            pulp2content.clear()
            existing_count = 0

    # If it's a per-repo content type and this is a migration re-run, we need to check whether
    # the existing content has been associated with a new repo since our last migration,
    # and if so, go back and create a Pulp2Content for those new relations.
    # E.g. errata copied from one repo to another in Pulp 2; in such cases _last_updated is
    # unchanged.
    if set_pulp2_repo and last_updated:
        # last_updated is a unix timestamp, we need to convert it to use in our Django query.
        last_updated = datetime.utcfromtimestamp(last_updated)

        # Query all new relations for that content since the last run
        content_relations = Pulp2RepoContent.objects.filter(
            pulp2_content_type_id=content_type,
            pulp2_repository__not_in_plan=False,
            pulp2_created__gte=last_updated
        ).select_related(
            'pulp2_repository'
        ).only(
            'pulp2_repository', 'pulp2_created',
        ).order_by('pulp2_created')

        mongo_content_qs = content_model.pulp2.objects(
            id__in=content_relations.values_list('pulp2_unit_id', flat=True))
        pulp2_content_by_id = {
            record.id: record for record in mongo_content_qs.only(*mongo_fields).no_cache()
        }

        for relation in content_relations:
            record = pulp2_content_by_id[relation.pulp2_unit_id]
            downloaded = record.downloaded if hasattr(record, 'downloaded') else False
            specific_content_q = Q(
                pulp2_content_type_id=record._content_type_id,
                pulp2_id=record.id,
                pulp2_repo=relation.pulp2_repository,
                pulp2_subid='',
            )

            # Ensure that no existing pulp2content slipped into bulk_create.
            # Otherwise, we'll have a problem with later bulk_create for detail models.
            if Pulp2Content.objects.filter(specific_content_q).exists():
                continue

            item = Pulp2Content(
                pulp2_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                # Set `pulp2_last_updated` to the date of when a content unit got copied.
                # (We can't set it to anything higher, in case pre-migration crashes and we would
                # need to pick it up correctly on the next re-run.)
                # When erratum is copied in pulp 2, it doesn't change its _last_updated timestamp.
                # It means that Katello has no way to identify that the erratum has been copied
                # since the last migration run, without reimporting all errata, which is expensive.
                pulp2_last_updated=int(relation.pulp2_created.timestamp()),
                pulp2_storage_path=record._storage_path,
                downloaded=downloaded,
                pulp2_repo=relation.pulp2_repository
            )
            _logger.debug(
                'Add content item to the list to migrate: {item}'.format(item=item))
            pulp2content.append(item)
            pulp2content_pb.total += 1
            pulp2detail_pb.total += 1

        pulp2content_batch = Pulp2Content.objects.bulk_create(pulp2content)
        pulp2content_pb.done += len(pulp2content_batch)
        pulp2content_pb.save()

        content_model.pulp_2to3_detail.pre_migrate_content_detail(pulp2content_batch)

        pulp2detail_pb.done += len(pulp2content_batch)
        pulp2detail_pb.save()

    pulp2content_pb.save()
    pulp2detail_pb.save()

    if pulp2mutatedcontent:
        # When we flip the is_migrated flag to False, we normally base that decision on
        # last_unit_added, see
        # https://github.com/pulp/pulp-2to3-migration/blob/master/pulp_2to3_migration/app/pre_migration.py#L279  # noqa
        # Here we still need to update the is_migrated flag manually because of errata:
        # in Pulp 2, the sync and copy cases of updated errata are not covered, since
        # last_unit_added is updated on all the repos that contain an erratum only when
        # the erratum is uploaded.
        mutated_content = Pulp2RepoContent.objects.filter(pulp2_unit_id__in=pulp2mutatedcontent)
        repo_to_update_ids = mutated_content.values_list(
            'pulp2_repository_id', flat=True).distinct()
        Pulp2Repository.objects.filter(pk__in=repo_to_update_ids).update(is_migrated=False)

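    # For on_demand (lazy) content types, also pre-migrate the Pulp 2 lazy catalog entries.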
    if lazy_type:
        pre_migrate_lazycatalog(content_type)

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
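
The comments around ignore_conflicts=True above describe re-resolving primary keys by each record's uniqueness fields. A minimal sketch of that idea outside Django follows; the Record dataclass, the db dict, and bulk_create_ignore_conflicts() are hypothetical stand-ins that only mimic the relevant behavior.

import uuid
from dataclasses import dataclass, field


@dataclass
class Record:
    pulp2_id: str
    pulp2_repo: str
    pk: uuid.UUID = field(default_factory=uuid.uuid4)


# A toy "table" keyed by the records' uniqueness fields.
db = {}


def bulk_create_ignore_conflicts(records):
    """Insert records unless a row with the same natural key exists; hand back the
    in-memory records unchanged, mimicking bulk_create(ignore_conflicts=True)."""
    for record in records:
        db.setdefault((record.pulp2_id, record.pulp2_repo), record)
    return records


# One record already exists in the "db" with its own PK.
db[("unit-1", "repo-a")] = Record("unit-1", "repo-a")

batch = [Record("unit-1", "repo-a"), Record("unit-2", "repo-a")]
returned = bulk_create_ignore_conflicts(batch)

# The returned objects may carry PKs that never made it into the db, so re-resolve each
# PK by the uniqueness fields before creating anything that points at these records.
for record in returned:
    record.pk = db[(record.pulp2_id, record.pulp2_repo)].pk

assert returned[0].pk == db[("unit-1", "repo-a")].pk
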
def pre_migrate_content_type(content_model, mutable_type, lazy_type,
                             premigrate_hook):
    """
    A coroutine to pre-migrate Pulp 2 content, including all details for on_demand content.

    Args:
        content_model: Models for content which is being migrated.
        mutable_type: Boolean that indicates whether the content type is mutable.
    """
    batch_size = 100
    pulp2content = []
    pulp2mutatedcontent = []
    content_type = content_model.pulp2.TYPE_ID
    set_pulp2_repo = content_model.pulp_2to3_detail.set_pulp2_repo

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(
        pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(
        Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug(
        'The latest migrated {type} content has {timestamp} timestamp.'.format(
            type=content_type, timestamp=last_updated))

    query_args = {}
    if premigrate_hook:
        pulp2_content_ids = premigrate_hook()
        query_args["id__in"] = pulp2_content_ids

    mongo_content_qs = content_model.pulp2.objects(
        _last_updated__gte=last_updated,
        **query_args).order_by("_last_updated")

    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type, total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(
            content_type.upper()),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(
            content_type.upper()),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    existing_count = 0

    if mutable_type:
        pulp2_content_ids = []

        for c in mongo_content_qs.only(
                'id', '_last_updated').no_cache().as_pymongo():
            if c['_last_updated'] == last_updated:
                if Pulp2Content.objects.filter(pulp2_last_updated=last_updated,
                                               pulp2_id=c['_id']).exists():
                    continue

                pulp2_content_ids.append(c['_id'])

        # This is a mutable content type. Query for the existing pulp2content records:
        # if any are found, the migrated content is older than the incoming content.
        # Delete the outdated migrated pulp2content so new records are created below.
        outdated = Pulp2Content.objects.filter(pulp2_id__in=pulp2_content_ids)
        if outdated.exists():
            pulp2mutatedcontent.extend(pulp2_content_ids)
        outdated.delete()

    mongo_fields = set(
        ['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        mongo_fields.add('downloaded')

    batched_mongo_content_qs = mongo_content_qs.only(
        *mongo_fields).batch_size(batch_size)
    for i, record in enumerate(batched_mongo_content_qs.no_cache()):
        if record._last_updated == last_updated:
            # corner case - content with the latest ``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(
                pulp2_last_updated=last_updated, pulp2_id=record.id)
            if migrated.exists():
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2detail_pb.total -= 1
                continue

        downloaded = record.downloaded if hasattr(record,
                                                  'downloaded') else False

        if set_pulp2_repo:
            # This content type requires the Pulp 2 repo to be set, e.g. for errata, because
            # one Pulp 2 content unit is converted into N Pulp 3 content units and repo_id is
            # the only way to keep those records unique.
            content_relations = Pulp2RepoContent.objects.filter(
                pulp2_unit_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                pulp2_repository__not_in_plan=False,
            ).select_related('pulp2_repository').only('pulp2_repository')
            for relation in content_relations.iterator():
                item = Pulp2Content(
                    pulp2_id=record.id,
                    pulp2_content_type_id=record._content_type_id,
                    pulp2_last_updated=record._last_updated,
                    pulp2_storage_path=record._storage_path,
                    downloaded=downloaded,
                    pulp2_repo=relation.pulp2_repository)
                _logger.debug(
                    'Add content item to the list to migrate: {item}'.format(
                        item=item))
                pulp2content.append(item)
                pulp2content_pb.total += 1
                pulp2detail_pb.total += 1

            # The totals need to be adjusted: proper counting happened in the loop above, so
            # subtract one because this record was already counted in the initial 'total'.
            pulp2content_pb.total -= 1
            pulp2detail_pb.total -= 1
        else:
            item = Pulp2Content(pulp2_id=record.id,
                                pulp2_content_type_id=record._content_type_id,
                                pulp2_last_updated=record._last_updated,
                                pulp2_storage_path=record._storage_path,
                                downloaded=downloaded)
            _logger.debug(
                'Add content item to the list to migrate: {item}'.format(
                    item=item))
            pulp2content.append(item)

        # determine if the batch needs to be saved, also take into account whether there is
        # anything in the pulp2content to be saved
        save_batch = pulp2content and (len(pulp2content) >= batch_size
                                       or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.
                format(index=i + 1))
            pulp2content_batch = Pulp2Content.objects.bulk_create(
                pulp2content, ignore_conflicts=True)
            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            content_model.pulp_2to3_detail.pre_migrate_content_detail(
                pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            pulp2content.clear()
            existing_count = 0

    pulp2content_pb.save()
    pulp2detail_pb.save()

    if pulp2mutatedcontent:
        # When we flip the is_migrated flag to False, we normally base that decision on
        # last_unit_added, see
        # https://github.com/pulp/pulp-2to3-migration/blob/master/pulp_2to3_migration/app/pre_migration.py#L279  # noqa
        # Here we still need to update the is_migrated flag manually because of errata:
        # in Pulp 2, the sync and copy cases of updated errata are not covered, since
        # last_unit_added is updated on all the repos that contain an erratum only when
        # the erratum is uploaded.
        mutated_content = Pulp2RepoContent.objects.filter(
            pulp2_unit_id__in=pulp2mutatedcontent)
        repo_to_update_ids = mutated_content.values_list('pulp2_repository_id',
                                                         flat=True).distinct()
        Pulp2Repository.objects.filter(pk__in=repo_to_update_ids).update(
            is_migrated=False)

    if lazy_type:
        pre_migrate_lazycatalog(content_type)

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
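
Both versions of pre_migrate_content_type() above select content incrementally by _last_updated and handle the corner case of records that share the boundary timestamp with already pre-migrated content. A minimal sketch of that selection logic, with plain dicts and a hypothetical select_for_premigration() standing in for the Mongo and Django querysets:

def select_for_premigration(pulp2_records, already_migrated):
    """
    pulp2_records: iterable of {"id": ..., "last_updated": ...} dicts (the Pulp 2 side).
    already_migrated: set of (id, last_updated) pairs recorded by earlier runs.
    Returns the records that still need to be pre-migrated.
    """
    last_updated = max((ts for _id, ts in already_migrated), default=0)
    selected = []
    for record in sorted(pulp2_records, key=lambda r: r["last_updated"]):
        if record["last_updated"] < last_updated:
            continue  # older than everything already migrated
        if (record["id"], record["last_updated"]) in already_migrated:
            continue  # already pre-migrated; only possible at the boundary timestamp
        selected.append(record)
    return selected


records = [
    {"id": "a", "last_updated": 1},
    {"id": "b", "last_updated": 2},
    {"id": "c", "last_updated": 2},
]
assert [r["id"] for r in select_for_premigration(records, {("a", 1), ("b", 2)})] == ["c"]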