Example #1
class CollectionSyncFirstStage(Stage):
    """
    The first stage of a pulp_ansible sync pipeline.
    """
    def __init__(self, remote, repository, optimize):
        """
        The first stage of a pulp_ansible sync pipeline.

        Args:
            remote (CollectionRemote): The remote data to be used when syncing
            repository (AnsibleRepository): The repository being synced.
            optimize (boolean): Whether to optimize sync or not.

        """
        super().__init__()
        msg = _("Parsing CollectionVersion Metadata")
        self.parsing_metadata_progress_bar = ProgressReport(
            message=msg, code="parsing.metadata")
        self.remote = remote
        self.repository = repository
        self.optimize = optimize
        self.collection_info = parse_collections_requirements_file(
            remote.requirements_file)
        self.deprecations = Q()
        self.add_dependents = self.collection_info and self.remote.sync_dependencies
        self.already_synced = set()
        self._unpaginated_collection_metadata = None
        self._unpaginated_collection_version_metadata = None
        self.last_synced_metadata_time = None

        # Interpret download policy
        self.deferred_download = self.remote.policy != Remote.IMMEDIATE

    @alru_cache(maxsize=128)
    async def _get_root_api(self, root):
        """
        Returns the root api path and api version.

        Based on https://git.io/JTMxE.
        """
        if root in ("https://galaxy.ansible.com", "https://galaxy.ansible.com/"):
            root = "https://galaxy.ansible.com/api/"

        downloader = self.remote.get_downloader(url=root)

        try:
            api_data = parse_metadata(await downloader.run())
        except (json.decoder.JSONDecodeError, ClientResponseError):
            if root.endswith("/api/"):
                raise

            root = urljoin(root, "api/")
            downloader = self.remote.get_downloader(url=root)
            api_data = parse_metadata(await downloader.run())

        if "available_versions" not in api_data:
            raise RuntimeError(
                _("Could not find 'available_versions' at {}").format(root))

        if "v3" in api_data.get("available_versions", {}):
            api_version = 3
        elif "v2" in api_data.get("available_versions", {}):
            api_version = 2
        else:
            raise RuntimeError(
                _("Unsupported API versions at {}").format(root))

        endpoint = f"{root}v{api_version}"

        return endpoint, api_version

    @alru_cache(maxsize=128)
    async def _get_paginated_collection_api(self, root):
        """
        Returns the collection api path and api version.

        Based on https://git.io/JTMxE.
        """
        endpoint, api_version = await self._get_root_api(root)
        return f"{endpoint}/collections/", api_version

    async def _fetch_collection_version_metadata(self, api_version,
                                                 collection_version_url):
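        """Download a collection version's detail metadata and queue it for the pipeline."""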
        downloader = self.remote.get_downloader(url=collection_version_url)
        metadata = parse_metadata(await downloader.run())
        await self._add_collection_version(api_version, collection_version_url,
                                           metadata)

    async def _add_collection_version(self, api_version,
                                      collection_version_url, metadata):
        """Add CollectionVersion to the sync pipeline."""
        url = metadata["download_url"]
        collection_version = CollectionVersion(
            namespace=metadata["namespace"]["name"],
            name=metadata["collection"]["name"],
            version=metadata["version"],
        )
        cv_unique = attrgetter("namespace", "name",
                               "version")(collection_version)
        if cv_unique in self.already_synced:
            return
        self.already_synced.add(cv_unique)

        info = metadata["metadata"]

        if self.add_dependents:
            dependencies = info["dependencies"]
            tasks = []
            loop = asyncio.get_event_loop()
            for full_name, version in dependencies.items():
                namespace, name = full_name.split(".")
                if (namespace, name, version) not in self.already_synced:
                    new_req = RequirementsFileEntry(
                        name=full_name,
                        version=version,
                        source=None,
                    )
                    tasks.append(
                        loop.create_task(
                            self._fetch_collection_metadata(new_req)))
            await asyncio.gather(*tasks)

        info.pop("tags")
        for attr_name, attr_value in info.items():
            if attr_value is None or attr_name not in collection_version.__dict__:
                continue
            setattr(collection_version, attr_name, attr_value)

        artifact = metadata["artifact"]

        d_artifact = DeclarativeArtifact(
            artifact=Artifact(sha256=artifact["sha256"],
                              size=artifact["size"]),
            url=url,
            relative_path=collection_version.relative_path,
            remote=self.remote,
            deferred_download=self.deferred_download,
        )

        extra_data = {}
        if api_version != 2:  # V2 never implemented the docs-blob requests
            extra_data["docs_blob_url"] = f"{collection_version_url}docs-blob/"

        d_content = DeclarativeContent(
            content=collection_version,
            d_artifacts=[d_artifact],
            extra_data=extra_data,
        )
        self.parsing_metadata_progress_bar.increment()
        await self.put(d_content)

    def _collection_versions_list_downloader(self, api_version,
                                             collection_endpoint, namespace,
                                             name, page_num, page_size):
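        """Build a downloader for one page of a collection's version list (v2 or v3 paging)."""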
        url_without_get_params = f"{collection_endpoint}{namespace}/{name}/versions/"
        if api_version == 2:
            versions_list_url = f"{url_without_get_params}?page={page_num}&page_size={page_size}"
        else:
            offset = (page_num - 1) * page_size
            versions_list_url = f"{url_without_get_params}?limit={page_size}&offset={offset}"
        return self.remote.get_downloader(url=versions_list_url)

    async def _fetch_paginated_collection_metadata(self,
                                                   name,
                                                   namespace,
                                                   requirement,
                                                   source=None):
        root = source or self.remote.url
        collection_endpoint, api_version = await self._get_paginated_collection_api(
            root)
        collection_url = f"{collection_endpoint}{namespace}/{name}"
        collection_metadata_downloader = self.remote.get_downloader(
            url=collection_url)
        collection_metadata = parse_metadata(
            await collection_metadata_downloader.run())
        loop = asyncio.get_event_loop()

        tasks = []
        page_num = 1
        while True:
            versions_list_downloader = self._collection_versions_list_downloader(
                api_version, collection_endpoint, namespace, name, page_num,
                PAGE_SIZE)
            collection_versions_list = parse_metadata(
                await versions_list_downloader.run())
            if api_version == 2:
                collection_versions = collection_versions_list["results"]
            else:
                collection_versions = collection_versions_list["data"]
            for collection_version in collection_versions:
                if collection_version["version"] in requirement:
                    version_num = collection_version["version"]
                    collection_version_detail_url = f"{collection_url}/versions/{version_num}/"
                    if collection_metadata["deprecated"]:
                        self.deprecations |= Q(namespace=namespace, name=name)
                    tasks.append(
                        loop.create_task(
                            self._fetch_collection_version_metadata(
                                api_version,
                                collection_version_detail_url,
                            )))
            next_value = self._get_response_next_value(
                api_version, collection_versions_list)
            if not next_value:
                break
            page_num += 1

        await asyncio.gather(*tasks)

    async def _read_from_downloaded_metadata(self, name, namespace,
                                             requirement):
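        """Queue all matching collection versions found in the pre-downloaded bulk metadata."""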
        tasks = []
        loop = asyncio.get_event_loop()

        if self._unpaginated_collection_metadata[namespace][name]["deprecated"]:
            self.deprecations |= Q(namespace=namespace, name=name)

        all_versions_of_collection = self._unpaginated_collection_version_metadata[
            namespace][name]

        for col_version_metadata in all_versions_of_collection:
            if col_version_metadata["version"] in requirement:
                collection_version_url = urljoin(
                    self.remote.url, f"{col_version_metadata['href']}")
                tasks.append(
                    loop.create_task(
                        self._add_collection_version(self._api_version,
                                                     collection_version_url,
                                                     col_version_metadata)))
        await asyncio.gather(*tasks)

    async def _fetch_collection_metadata(self, requirements_entry):
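        """Resolve a single requirements-file entry, preferring bulk metadata when available."""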
        if requirements_entry.version == "*":
            requirement_version = Requirement.parse("collection")
        else:
            requirement_version = Requirement.parse(
                f"collection{requirements_entry.version}")

        namespace, name = requirements_entry.name.split(".")

        if self._unpaginated_collection_version_metadata and requirements_entry.source is None:
            await self._read_from_downloaded_metadata(name, namespace,
                                                      requirement_version)
        else:
            await self._fetch_paginated_collection_metadata(
                name, namespace, requirement_version,
                requirements_entry.source)

    @staticmethod
    def _get_response_next_value(api_version, response):
        if api_version == 2:
            return response["next"]
        else:
            return response["links"]["next"]

    def _collection_list_downloader(self, api_version, collection_endpoint,
                                    page_num, page_size):
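        """Build a downloader for one page of the collection list (v2 or v3 paging)."""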
        if api_version == 2:
            collection_list_url = f"{collection_endpoint}?page={page_num}&page_size={page_size}"
        else:
            offset = (page_num - 1) * page_size
            collection_list_url = f"{collection_endpoint}?limit={page_size}&offset={offset}"
        return self.remote.get_downloader(url=collection_list_url)

    async def _download_unpaginated_metadata(self):
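        """Fetch all collection and collection-version metadata in bulk via the v3 'all/' endpoints."""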
        root_endpoint, api_version = await self._get_root_api(self.remote.url)
        self._api_version = api_version
        if api_version > 2:
            collection_endpoint = f"{root_endpoint}/collections/all/"
            downloader = self.remote.get_downloader(
                url=collection_endpoint,
                silence_errors_for_response_status_codes={404})
            try:
                collection_metadata_list = parse_metadata(
                    await downloader.run())
            except FileNotFoundError:
                pass
            else:
                self._unpaginated_collection_metadata = defaultdict(dict)
                for collection in collection_metadata_list:
                    namespace = collection["namespace"]
                    name = collection["name"]
                    self._unpaginated_collection_metadata[namespace][
                        name] = collection

                collection_version_endpoint = f"{root_endpoint}/collection_versions/all/"
                downloader = self.remote.get_downloader(
                    url=collection_version_endpoint)
                collection_version_metadata_list = parse_metadata(
                    await downloader.run())

                self._unpaginated_collection_version_metadata = defaultdict(
                    lambda: defaultdict(list))
                for collection_version_metadata in collection_version_metadata_list:
                    namespace = collection_version_metadata["namespace"][
                        "name"]
                    name = collection_version_metadata["name"]
                    self._unpaginated_collection_version_metadata[namespace][
                        name].append(collection_version_metadata)

    async def _find_all_collections_from_unpaginated_data(self):
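        """Record deprecations and queue every collection version from the bulk metadata."""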
        tasks = []
        loop = asyncio.get_event_loop()

        for collection_namespace_dict in self._unpaginated_collection_metadata.values():
            for collection in collection_namespace_dict.values():
                if collection["deprecated"]:
                    self.deprecations |= Q(namespace=collection["namespace"],
                                           name=collection["name"])

        for collections_in_namespace in self._unpaginated_collection_version_metadata.values():
            for collection_versions in collections_in_namespace.values():
                for collection_version in collection_versions:
                    collection_version_url = urljoin(
                        self.remote.url, f"{collection_version['href']}")
                    tasks.append(
                        loop.create_task(
                            self._add_collection_version(
                                self._api_version, collection_version_url,
                                collection_version)))

        await asyncio.gather(*tasks)

    async def _find_all_collections(self):
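        """Queue every collection on the remote, falling back to paginated APIs without bulk data."""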
        if self._unpaginated_collection_version_metadata:
            await self._find_all_collections_from_unpaginated_data()
            return

        collection_endpoint, api_version = await self._get_paginated_collection_api(
            self.remote.url)
        loop = asyncio.get_event_loop()

        tasks = []
        page_num = 1
        while True:
            collection_list_downloader = self._collection_list_downloader(
                api_version, collection_endpoint, page_num, PAGE_SIZE)
            collection_list = parse_metadata(
                await collection_list_downloader.run())

            if api_version == 2:
                collections = collection_list["results"]
            else:
                collections = collection_list["data"]

            for collection in collections:
                if api_version == 2:
                    namespace = collection["namespace"]["name"]
                else:
                    namespace = collection["namespace"]
                name = collection["name"]
                requirements_file = RequirementsFileEntry(
                    name=".".join([namespace, name]),
                    version="*",
                    source=None,
                )
                tasks.append(
                    loop.create_task(
                        self._fetch_collection_metadata(requirements_file)))

            next_value = self._get_response_next_value(api_version,
                                                       collection_list)
            if not next_value:
                break
            page_num += 1

        await asyncio.gather(*tasks)

    async def _should_we_sync(self):
        """Check last synced metadata time."""
        msg = _("no-op: Checking if remote changed since last sync.")
        noop = ProgressReport(message=msg, code="noop")
        noop.state = TASK_STATES.COMPLETED
        noop.save()

        if not self.repository.remote:
            return True

        if self.remote != self.repository.remote.cast():
            return True

        root, api_version = await self._get_root_api(self.remote.url)
        if api_version == 3:
            downloader = self.remote.get_downloader(
                url=root, silence_errors_for_response_status_codes={404})
            try:
                metadata = parse_metadata(await downloader.run())
            except FileNotFoundError:
                return True

            try:
                self.last_synced_metadata_time = parse_datetime(
                    metadata["published"])
            except KeyError:
                return True

            sources = set()
            if self.collection_info:
                sources = {r.source for r in self.collection_info if r.source}
            sources.add(self.remote.url)
            if len(sources) > 1:
                return True

            if self.last_synced_metadata_time == self.repository.last_synced_metadata_time:
                noop.message = _(
                    "no-op: {remote} did not change since last sync - {published}"
                    .format(remote=self.remote.url,
                            published=self.last_synced_metadata_time))
                noop.save()
                return False

        return True

    async def run(self):
        """
        Build and emit `DeclarativeContent` from the ansible metadata.
        """
        if self.optimize:
            should_we_sync = await self._should_we_sync()
            if should_we_sync is False:
                log.debug(_("no-op: remote wasn't updated since last sync."))
                return

        tasks = []
        loop = asyncio.get_event_loop()

        await self._download_unpaginated_metadata()

        if self.collection_info:
            for requirement_entry in self.collection_info:
                tasks.append(
                    loop.create_task(
                        self._fetch_collection_metadata(requirement_entry)))
        else:
            tasks.append(loop.create_task(self._find_all_collections()))
        await asyncio.gather(*tasks)
        self.parsing_metadata_progress_bar.state = TASK_STATES.COMPLETED
        self.parsing_metadata_progress_bar.save()
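
How such a first stage is typically driven is not shown above; the following is a minimal sketch, assuming the standard pulpcore `DeclarativeVersion` pattern (the `synchronize` task name is hypothetical):

def synchronize(remote, repository, optimize=True, mirror=False):
    """Feed the first stage into a declarative pipeline to build a repository version."""
    # Assumption: DeclarativeVersion from pulpcore.plugin.stages runs the
    # stage pipeline and persists a new repository version on .create().
    from pulpcore.plugin.stages import DeclarativeVersion

    first_stage = CollectionSyncFirstStage(remote, repository, optimize)
    return DeclarativeVersion(first_stage, repository, mirror=mirror).create()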
Example #2
    async def run(self):
        """
        Download the upstream tag list and manifests, emitting tags, manifests, and blobs.
        """
        future_manifests = []
        tag_list = []
        to_download = []
        man_dcs = {}
        total_blobs = []

        with ProgressReport(
            message='Downloading tag list', code='downloading.tag_list', total=1
        ) as pb:
            repo_name = self.remote.namespaced_upstream_name
            relative_url = '/v2/{name}/tags/list'.format(name=repo_name)
            tag_list_url = urljoin(self.remote.url, relative_url)
            list_downloader = self.remote.get_downloader(url=tag_list_url)
            await list_downloader.run(extra_data={'repo_name': repo_name})

            with open(list_downloader.path) as tags_raw:
                tags_dict = json.loads(tags_raw.read())
                tag_list = tags_dict['tags']

            # check for the presence of the pagination link header
            link = list_downloader.response_headers.get('Link')
            await self.handle_pagination(link, repo_name, tag_list)
            tag_list = self.filter_tags(tag_list)
            pb.increment()

        for tag_name in tag_list:
            relative_url = '/v2/{name}/manifests/{tag}'.format(
                name=self.remote.namespaced_upstream_name,
                tag=tag_name,
            )
            url = urljoin(self.remote.url, relative_url)
            downloader = self.remote.get_downloader(url=url)
            to_download.append(downloader.run(extra_data={'headers': V2_ACCEPT_HEADERS}))

        pb_parsed_tags = ProgressReport(
            message='Processing Tags',
            code='processing.tag',
            state=TASK_STATES.RUNNING,
            total=len(tag_list)
        )

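        # Process each manifest as soon as its download finishes instead of awaiting all tags.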
        for download_tag in asyncio.as_completed(to_download):
            tag = await download_tag
            with open(tag.path, 'rb') as content_file:
                raw_data = content_file.read()
            content_data = json.loads(raw_data)
            media_type = content_data.get('mediaType')
            tag.artifact_attributes['file'] = tag.path
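            # Save the artifact; on a duplicate digest, reuse the already-stored row instead.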
            saved_artifact = Artifact(**tag.artifact_attributes)
            try:
                saved_artifact.save()
            except IntegrityError:
                del tag.artifact_attributes['file']
                saved_artifact = Artifact.objects.get(**tag.artifact_attributes)
            tag_dc = self.create_tag(saved_artifact, tag.url)

            if media_type in (MEDIA_TYPE.MANIFEST_LIST, MEDIA_TYPE.INDEX_OCI):
                list_dc = self.create_tagged_manifest_list(
                    tag_dc, content_data)
                await self.put(list_dc)
                tag_dc.extra_data['man_relation'] = list_dc
                for manifest_data in content_data.get('manifests'):
                    man_dc = self.create_manifest(list_dc, manifest_data)
                    future_manifests.append(man_dc)
                    man_dcs[man_dc.content.digest] = man_dc
                    await self.put(man_dc)
            else:
                man_dc = self.create_tagged_manifest(tag_dc, content_data, raw_data)
                await self.put(man_dc)
                tag_dc.extra_data['man_relation'] = man_dc
                self.handle_blobs(man_dc, content_data, total_blobs)
            await self.put(tag_dc)
            pb_parsed_tags.increment()

        pb_parsed_tags.state = TASK_STATES.COMPLETED
        pb_parsed_tags.save()

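        # Manifests listed by a manifest list resolve later in the pipeline;
        # only then can their blobs be read and queued.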
        for manifest_future in future_manifests:
            man = await manifest_future.resolution()
            with man._artifacts.get().file.open() as content_file:
                raw = content_file.read()
            content_data = json.loads(raw)
            man_dc = man_dcs[man.digest]
            self.handle_blobs(man_dc, content_data, total_blobs)
        for blob in total_blobs:
            await self.put(blob)
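
The tag-list download above defers to `self.handle_pagination`, which this example does not show. A minimal sketch of what it might look like, assuming the registry paginates tag lists through an RFC 5988 `Link` header and reusing only downloader calls that already appear above:

    async def handle_pagination(self, link, repo_name, tag_list):
        """Follow rel="next" links until the upstream tag list is exhausted."""
        while link and 'rel="next"' in link:
            # Assumed header shape: </v2/<name>/tags/list?n=100&last=x>; rel="next"
            next_path = link.split(';')[0].strip(' <>')
            downloader = self.remote.get_downloader(url=urljoin(self.remote.url, next_path))
            await downloader.run(extra_data={'repo_name': repo_name})
            with open(downloader.path) as tags_raw:
                tag_list.extend(json.loads(tags_raw.read())['tags'])
            link = downloader.response_headers.get('Link')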
Example #3
    async def run(self):
        """
        Build `DeclarativeContent` from the repodata.
        """
        packages_pb = ProgressReport(message='Parsed Packages',
                                     code='parsing.packages')
        errata_pb = ProgressReport(message='Parsed Erratum',
                                   code='parsing.errata')
        modulemd_pb = ProgressReport(message='Parsed Modulemd',
                                     code='parsing.modulemds')
        modulemd_defaults_pb = ProgressReport(
            message='Parsed Modulemd-defaults', code='parsing.modulemddefaults')
        comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

        packages_pb.save()
        errata_pb.save()
        comps_pb.save()

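        # Normalize the base URL to a trailing slash so urljoin() keeps the repo path intact.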
        remote_url = self.new_url or self.remote.url
        remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

        progress_data = dict(message='Downloading Metadata Files',
                             code='downloading.metadata')
        with ProgressReport(**progress_data) as metadata_pb:
            downloader = self.remote.get_downloader(
                url=urljoin(remote_url, 'repodata/repomd.xml'))
            # TODO: decide how to distinguish between a mirror list and a normal repo
            result = await downloader.run()
            metadata_pb.increment()

            if self.kickstart:
                d_artifacts = []
                for path, checksum in self.kickstart["download"]["images"].items():
                    artifact = Artifact(**checksum)

                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=urljoin(remote_url, path),
                        relative_path=path,
                        remote=self.remote,
                        deferred_download=self.deferred_download)

                    d_artifacts.append(da)

                distribution_tree = DistributionTree(
                    **self.kickstart["distribution_tree"])
                dc = DeclarativeContent(content=distribution_tree,
                                        d_artifacts=d_artifacts)
                dc.extra_data = self.kickstart
                await self.put(dc)

            repomd_path = result.path
            repomd = cr.Repomd(repomd_path)
            package_repodata_urls = {}
            downloaders = []
            modulemd_list = []
            dc_groups = []
            dc_categories = []
            dc_environments = []
            nevra_to_module = defaultdict(set)
            pkgname_to_groups = defaultdict(list)
            group_to_categories = defaultdict(list)
            group_to_environments = defaultdict(list)
            optionalgroup_to_environments = defaultdict(list)
            modulemd_results = None
            comps_downloader = None

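            # Sort repomd records into buckets: package metadata, updateinfo,
            # comps, modulemd, and any other (non-database) metadata files.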
            for record in repomd.records:
                if record.type in PACKAGE_REPODATA:
                    package_repodata_urls[record.type] = urljoin(
                        remote_url, record.location_href)
                elif record.type in UPDATE_REPODATA:
                    updateinfo_url = urljoin(remote_url, record.location_href)
                    downloader = self.remote.get_downloader(url=updateinfo_url)
                    downloaders.append([downloader.run()])

                elif record.type in COMPS_REPODATA:
                    comps_url = urljoin(remote_url, record.location_href)
                    comps_downloader = self.remote.get_downloader(
                        url=comps_url)

                elif record.type in SKIP_REPODATA:
                    continue

                elif record.type in MODULAR_REPODATA:
                    modules_url = urljoin(remote_url, record.location_href)
                    modulemd_downloader = self.remote.get_downloader(
                        url=modules_url)
                    modulemd_results = await modulemd_downloader.run()

                elif record.type not in PACKAGE_DB_REPODATA:
                    file_data = {
                        record.checksum_type: record.checksum,
                        "size": record.size
                    }
                    da = DeclarativeArtifact(
                        artifact=Artifact(**file_data),
                        url=urljoin(remote_url, record.location_href),
                        relative_path=record.location_href,
                        remote=self.remote,
                        deferred_download=False)
                    repo_metadata_file = RepoMetadataFile(
                        data_type=record.type,
                        checksum_type=record.checksum_type,
                        checksum=record.checksum,
                    )
                    dc = DeclarativeContent(content=repo_metadata_file,
                                            d_artifacts=[da])
                    await self.put(dc)

            # we have to sync module.yaml first if it exists, to make relations to packages
            if modulemd_results:
                modulemd_index = mmdlib.ModuleIndex.new()
                open_func = gzip.open if modulemd_results.url.endswith(
                    '.gz') else open
                with open_func(modulemd_results.path, 'r') as moduleyaml:
                    modulemd_index.update_from_string(
                        moduleyaml.read().decode(), True)

                modulemd_names = modulemd_index.get_module_names() or []
                modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

                modulemd_pb.total = len(modulemd_all)
                modulemd_pb.state = 'running'
                modulemd_pb.save()

                for modulemd in modulemd_all:
                    artifact = modulemd.pop('artifact')
                    relative_path = '{}{}{}{}{}snippet'.format(
                        modulemd[PULP_MODULE_ATTR.NAME],
                        modulemd[PULP_MODULE_ATTR.STREAM],
                        modulemd[PULP_MODULE_ATTR.VERSION],
                        modulemd[PULP_MODULE_ATTR.CONTEXT],
                        modulemd[PULP_MODULE_ATTR.ARCH])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    modulemd_content = Modulemd(**modulemd)
                    dc = DeclarativeContent(content=modulemd_content,
                                            d_artifacts=[da])
                    dc.extra_data = defaultdict(list)

                    # dc.content.artifacts are Modulemd artifacts
                    for artifact in json.loads(dc.content.artifacts):
                        nevra_to_module[artifact].add(dc)
                    modulemd_list.append(dc)

                modulemd_default_names = parse_defaults(modulemd_index)

                modulemd_defaults_pb.total = len(modulemd_default_names)
                modulemd_defaults_pb.state = 'running'
                modulemd_defaults_pb.save()

                for default in modulemd_default_names:
                    artifact = default.pop('artifact')
                    relative_path = '{}{}snippet'.format(
                        default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                        default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    default_content = ModulemdDefaults(**default)
                    modulemd_defaults_pb.increment()
                    dc = DeclarativeContent(content=default_content,
                                            d_artifacts=[da])
                    await self.put(dc)

            if comps_downloader:
                comps_result = await comps_downloader.run()

                comps = libcomps.Comps()
                comps.fromxml_f(comps_result.path)

                comps_pb.total = (len(comps.groups) + len(comps.categories) +
                                  len(comps.environments))
                comps_pb.state = 'running'
                comps_pb.save()

                if comps.langpacks:
                    langpack_dict = PackageLangpacks.libcomps_to_dict(
                        comps.langpacks)
                    packagelangpack = PackageLangpacks(
                        matches=strdict_to_dict(comps.langpacks),
                        digest=dict_digest(langpack_dict))
                    dc = DeclarativeContent(content=packagelangpack)
                    dc.extra_data = defaultdict(list)
                    await self.put(dc)

                if comps.categories:
                    for category in comps.categories:
                        category_dict = PackageCategory.libcomps_to_dict(
                            category)
                        category_dict['digest'] = dict_digest(category_dict)
                        packagecategory = PackageCategory(**category_dict)
                        dc = DeclarativeContent(content=packagecategory)
                        dc.extra_data = defaultdict(list)

                        if packagecategory.group_ids:
                            for group_id in packagecategory.group_ids:
                                group_to_categories[group_id['name']].append(
                                    dc)
                        dc_categories.append(dc)

                if comps.environments:
                    for environment in comps.environments:
                        environment_dict = PackageEnvironment.libcomps_to_dict(
                            environment)
                        environment_dict['digest'] = dict_digest(
                            environment_dict)
                        packageenvironment = PackageEnvironment(
                            **environment_dict)
                        dc = DeclarativeContent(content=packageenvironment)
                        dc.extra_data = defaultdict(list)

                        if packageenvironment.option_ids:
                            for option_id in packageenvironment.option_ids:
                                optionalgroup_to_environments[
                                    option_id['name']].append(dc)

                        if packageenvironment.group_ids:
                            for group_id in packageenvironment.group_ids:
                                group_to_environments[group_id['name']].append(
                                    dc)

                        dc_environments.append(dc)

                if comps.groups:
                    for group in comps.groups:
                        group_dict = PackageGroup.libcomps_to_dict(group)
                        group_dict['digest'] = dict_digest(group_dict)
                        packagegroup = PackageGroup(**group_dict)
                        dc = DeclarativeContent(content=packagegroup)
                        dc.extra_data = defaultdict(list)

                        if packagegroup.packages:
                            for package in packagegroup.packages:
                                pkgname_to_groups[package['name']].append(dc)

                        if dc.content.id in group_to_categories:
                            for dc_category in group_to_categories[dc.content.id]:
                                dc.extra_data['category_relations'].append(
                                    dc_category)
                                dc_category.extra_data['packagegroups'].append(
                                    dc)

                        if dc.content.id in group_to_environments:
                            for dc_environment in group_to_environments[dc.content.id]:
                                dc.extra_data['environment_relations'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'packagegroups'].append(dc)

                        if dc.content.id in optionalgroup_to_environments:
                            for dc_environment in optionalgroup_to_environments[dc.content.id]:
                                dc.extra_data['env_relations_optional'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'optionalgroups'].append(dc)

                        dc_groups.append(dc)

                for dc_category in dc_categories:
                    comps_pb.increment()
                    await self.put(dc_category)

                for dc_environment in dc_environments:
                    comps_pb.increment()
                    await self.put(dc_environment)

            # to preserve order, downloaders are created after all repodata urls are identified
            package_repodata_downloaders = []
            for repodata_type in PACKAGE_REPODATA:
                downloader = self.remote.get_downloader(
                    url=package_repodata_urls[repodata_type])
                package_repodata_downloaders.append(downloader.run())

            downloaders.append(package_repodata_downloaders)

            # asyncio.gather is used to preserve the order of results for package repodata
            pending = [
                asyncio.gather(*downloaders_group)
                for downloaders_group in downloaders
            ]

            while pending:
                done, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED)
                for downloader in done:
                    results = downloader.result()
                    if results[0].url == package_repodata_urls['primary']:
                        primary_xml_path = results[0].path
                        filelists_xml_path = results[1].path
                        other_xml_path = results[2].path
                        metadata_pb.done += 3
                        metadata_pb.save()

                        packages = await RpmFirstStage.parse_repodata(
                            primary_xml_path, filelists_xml_path,
                            other_xml_path)
                        packages_pb.total = len(packages)
                        packages_pb.state = 'running'
                        packages_pb.save()

                        for pkg in packages.values():
                            package = Package(
                                **Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
                            checksum_type = getattr(
                                CHECKSUM_TYPES, package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(remote_url, package.location_href)
                            filename = os.path.basename(package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package,
                                                    d_artifacts=[da])
                            dc.extra_data = defaultdict(list)

                            # find if a package relates to a modulemd
                            if dc.content.nevra in nevra_to_module:
                                dc.content.is_modular = True
                                for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                    dc.extra_data['modulemd_relation'].append(
                                        dc_modulemd)
                                    dc_modulemd.extra_data[
                                        'package_relation'].append(dc)

                            if dc.content.name in pkgname_to_groups:
                                for dc_group in pkgname_to_groups[dc.content.name]:
                                    dc.extra_data['group_relations'].append(
                                        dc_group)
                                    dc_group.extra_data[
                                        'related_packages'].append(dc)

                            packages_pb.increment()
                            await self.put(dc)

                    elif results[0].url == updateinfo_url:
                        updateinfo_xml_path = results[0].path
                        metadata_pb.increment()

                        updates = await RpmFirstStage.parse_updateinfo(
                            updateinfo_xml_path)

                        errata_pb.total = len(updates)
                        errata_pb.state = 'running'
                        errata_pb.save()

                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = RpmFirstStage.hash_update_record(
                                update)
                            future_relations = {
                                'collections': defaultdict(list),
                                'references': []
                            }

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(
                                    collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(**pkg_dict)
                                    future_relations['collections'][
                                        coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(
                                    reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            errata_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

            # now send modules down the pipeline since all relations have been set up
            for modulemd in modulemd_list:
                modulemd_pb.increment()
                await self.put(modulemd)

            for dc_group in dc_groups:
                comps_pb.increment()
                await self.put(dc_group)

        packages_pb.state = 'completed'
        errata_pb.state = 'completed'
        modulemd_pb.state = 'completed'
        modulemd_defaults_pb.state = 'completed'
        comps_pb.state = 'completed'
        packages_pb.save()
        errata_pb.save()
        modulemd_pb.save()
        modulemd_defaults_pb.save()
        comps_pb.save()
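
The `asyncio.gather`-inside-`asyncio.wait` construction above works because `gather` preserves the submission order of each downloader group even though whole groups complete in arbitrary order. A self-contained illustration of that pattern (the `fetch` coroutine and its delays are purely hypothetical):

import asyncio

async def fetch(name, delay):
    # Stand-in for a metadata downloader; finishes after `delay` seconds.
    await asyncio.sleep(delay)
    return name

async def main():
    groups = [
        [fetch('primary', 0.3), fetch('filelists', 0.1), fetch('other', 0.2)],
        [fetch('updateinfo', 0.05)],
    ]
    # Each gather keeps its group's order; wait yields groups as they finish.
    pending = [asyncio.gather(*group) for group in groups]
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            print(task.result())
            # -> ['updateinfo'] first, then ['primary', 'filelists', 'other']

asyncio.run(main())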