Example 1
    def _init_dc_groups(self, comps):
        if comps.groups:
            for group in comps.groups:
                group_dict = PackageGroup.libcomps_to_dict(group)
                group_dict["digest"] = dict_digest(group_dict)
                packagegroup = PackageGroup(**group_dict)
                dc = DeclarativeContent(content=packagegroup)
                dc.extra_data = defaultdict(list)

                if packagegroup.packages:
                    for package in packagegroup.packages:
                        self.data.pkgname_to_groups[package["name"]].append(dc)

                if dc.content.id in self.group_to_categories.keys():
                    for dc_category in self.group_to_categories[dc.content.id]:
                        dc.extra_data["category_relations"].append(dc_category)
                        dc_category.extra_data["packagegroups"].append(dc)

                if dc.content.id in self.group_to_environments.keys():
                    for dc_environment in self.group_to_environments[
                            dc.content.id]:
                        dc.extra_data["environment_relations"].append(
                            dc_environment)
                        dc_environment.extra_data["packagegroups"].append(dc)

                if dc.content.id in self.optionalgroup_to_environments.keys():
                    for dc_environment in self.optionalgroup_to_environments[
                            dc.content.id]:
                        dc.extra_data["env_relations_optional"].append(
                            dc_environment)
                        dc_environment.extra_data["optionalgroups"].append(dc)

                self.data.dc_groups.append(dc)
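
The wiring above relies on stashing cross-references in each DeclarativeContent's extra_data (a defaultdict of lists) before anything is saved, so a later stage can resolve the relations. A minimal, runnable sketch of the same bookkeeping pattern with plain Python objects; Node and the names below are illustrative stand-ins, not Pulp APIs:

from collections import defaultdict


class Node:
    """Illustrative stand-in for DeclarativeContent: just a name plus extra_data."""

    def __init__(self, name):
        self.name = name
        self.extra_data = defaultdict(list)


# Categories are parsed first and register themselves under the group ids they reference.
group_to_categories = defaultdict(list)
desktop_category = Node("desktop")
for group_id in ("gnome-desktop", "kde-desktop"):
    group_to_categories[group_id].append(desktop_category)

# When a group is parsed later, both sides of the relation are linked through extra_data.
gnome_group = Node("gnome-desktop")
for dc_category in group_to_categories[gnome_group.name]:
    gnome_group.extra_data["category_relations"].append(dc_category)
    dc_category.extra_data["packagegroups"].append(gnome_group)

print(gnome_group.extra_data["category_relations"][0].name)  # -> desktop
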
Example 2
    async def parse_distribution_tree(self):
        """Parse content from the file treeinfo if present."""
        if self.treeinfo:
            d_artifacts = [
                DeclarativeArtifact(
                    artifact=Artifact(),
                    url=urljoin(self.data.remote_url,
                                self.treeinfo["filename"]),
                    relative_path=".treeinfo",
                    remote=self.remote,
                    deferred_download=False,
                )
            ]
            for path, checksum in self.treeinfo["download"]["images"].items():
                artifact = Artifact(**checksum)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(self.data.remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download,
                )
                d_artifacts.append(da)

            distribution_tree = DistributionTree(
                **self.treeinfo["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree,
                                    d_artifacts=d_artifacts)
            dc.extra_data = self.treeinfo
            await self.put(dc)
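
A first stage boils down to subclassing Stage, building DeclarativeArtifact/DeclarativeContent pairs, and awaiting put(). Below is a minimal sketch of that skeleton, assuming only pulpcore's stages API; the class name and constructor arguments are hypothetical, and the empty Artifact mirrors the .treeinfo case above where no checksums are known up front:

from pulpcore.plugin.models import Artifact
from pulpcore.plugin.stages import DeclarativeArtifact, DeclarativeContent, Stage


class SingleFileFirstStage(Stage):
    """Emit one DeclarativeContent for a single remote file."""

    def __init__(self, remote, url, relative_path, content):
        super().__init__()
        self.remote = remote
        self.url = url
        self.relative_path = relative_path
        self.content = content

    async def run(self):
        da = DeclarativeArtifact(
            artifact=Artifact(),  # no checksums known up front, as in the .treeinfo case
            url=self.url,
            relative_path=self.relative_path,
            remote=self.remote,
            deferred_download=False,
        )
        dc = DeclarativeContent(content=self.content, d_artifacts=[da])
        await self.put(dc)
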
Example 3
    async def migrate_to_pulp3(self, batch, pb=None):
        """
        Docker-specific implementation of DeclarativeContent creation for migrating
        docker content to Pulp 3.

        Args:
            batch: A batch of Pulp2Content objects to migrate to Pulp 3
        """

        for pulp2content in batch:
            pulp_2to3_detail_content = pulp2content.detail_model
            pulp3content = pulp_2to3_detail_content.create_pulp3_content()
            future_relations = {'pulp2content': pulp2content}
            # store digests for future pulp3 content relations
            if pulp_2to3_detail_content.type == 'docker_manifest':

                future_relations['blob_rel'] = pulp_2to3_detail_content.blobs
                future_relations[
                    'config_blob_rel'] = pulp_2to3_detail_content.config_blob

            if pulp_2to3_detail_content.type == 'docker_manifest_list':

                future_relations[
                    'man_rel'] = pulp_2to3_detail_content.listed_manifests

            if pulp_2to3_detail_content.type == 'docker_tag':

                future_relations[
                    'tag_rel'] = pulp_2to3_detail_content.tagged_manifest

            if pulp_2to3_detail_content.type == 'docker_tag':
                # dc without artifact; the artifact will be assigned in the _pre_save hook
                dc = DeclarativeContent(content=pulp3content)
            else:
                artifact = await self.create_artifact(
                    pulp2content.pulp2_storage_path,
                    pulp_2to3_detail_content.expected_digests,
                    pulp_2to3_detail_content.expected_size)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=NOT_USED,
                    relative_path=pulp_2to3_detail_content.relative_path_for_content_artifact,
                    remote=NOT_USED,
                    deferred_download=False)
                dc = DeclarativeContent(content=pulp3content,
                                        d_artifacts=[da],
                                        does_batch=False)

            dc.extra_data = future_relations
            await self.put(dc)
            if pb:
                pb.increment()
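
The relation hints stored in extra_data (blob_rel, config_blob_rel, man_rel, tag_rel) are meant to be consumed once the content has been saved. A hypothetical downstream stage could read them roughly as sketched below; it assumes pulpcore's Stage API, and the resolution logic itself is only a placeholder:

from pulpcore.plugin.stages import Stage


class ResolveFutureRelations(Stage):
    """Pass content through, inspecting the relation hints left by the first stage."""

    async def run(self):
        async for dc in self.items():
            relations = getattr(dc, "extra_data", None) or {}
            if "blob_rel" in relations:
                # e.g. attach the saved manifest to its blobs here
                pass
            if "tag_rel" in relations:
                # e.g. point the tag at its tagged manifest here
                pass
            await self.put(dc)
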
Example 4
    def _init_dc_categories(self, comps):
        if comps.categories:
            for category in comps.categories:
                category_dict = PackageCategory.libcomps_to_dict(category)
                category_dict["digest"] = dict_digest(category_dict)
                packagecategory = PackageCategory(**category_dict)
                dc = DeclarativeContent(content=packagecategory)
                dc.extra_data = defaultdict(list)

                if packagecategory.group_ids:
                    for group_id in packagecategory.group_ids:
                        self.group_to_categories[group_id["name"]].append(dc)
                self.dc_categories.append(dc)
Example 5
    async def _parse_packages(self, packages):
        progress_data = {
            "message": "Parsed Packages",
            "code": "sync.parsing.packages",
            "total": len(packages),
        }

        with ProgressReport(**progress_data) as packages_pb:
            while True:
                try:
                    (_, pkg) = packages.popitem(last=False)
                except KeyError:
                    break
                package = Package(**Package.createrepo_to_dict(pkg))
                del pkg
                artifact = Artifact(size=package.size_package)
                checksum_type = getattr(CHECKSUM_TYPES,
                                        package.checksum_type.upper())
                setattr(artifact, checksum_type, package.pkgId)
                url = urlpath_sanitize(self.data.remote_url,
                                       package.location_href)
                filename = os.path.basename(package.location_href)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=url,
                    relative_path=filename,
                    remote=self.remote,
                    deferred_download=self.deferred_download,
                )
                dc = DeclarativeContent(content=package, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # find if a package relates to a modulemd
                if dc.content.nevra in self.data.nevra_to_module.keys():
                    dc.content.is_modular = True
                    for dc_modulemd in self.data.nevra_to_module[
                            dc.content.nevra]:
                        dc.extra_data["modulemd_relation"].append(dc_modulemd)
                        dc_modulemd.extra_data["package_relation"].append(dc)

                if dc.content.name in self.data.pkgname_to_groups.keys():
                    for dc_group in self.data.pkgname_to_groups[
                            dc.content.name]:
                        dc.extra_data["group_relations"].append(dc_group)
                        dc_group.extra_data["related_packages"].append(dc)

                packages_pb.increment()
                await self.put(dc)
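
The artifact checksum handling above maps the package's checksum type onto the matching Artifact field with getattr/setattr. A self-contained illustration of that trick; CHECKSUM_TYPES and FakeArtifact here are stand-ins, not the real Pulp objects:

from types import SimpleNamespace

# Stand-in for the CHECKSUM_TYPES constant: upper-case names map to Artifact field names.
CHECKSUM_TYPES = SimpleNamespace(MD5="md5", SHA1="sha1", SHA256="sha256", SHA512="sha512")


class FakeArtifact:
    """Stand-in for pulpcore's Artifact, only to show the attribute assignment."""

    md5 = sha1 = sha256 = sha512 = None


artifact = FakeArtifact()
pkg_checksum_type = "sha256"                                  # as read from repodata
field = getattr(CHECKSUM_TYPES, pkg_checksum_type.upper())    # -> "sha256"
setattr(artifact, field, "0123abcd...")                       # artifact.sha256 = pkgId
print(artifact.sha256)
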
Example 6
    def _init_dc_environments(self, comps):
        if comps.environments:
            for environment in comps.environments:
                environment_dict = PackageEnvironment.libcomps_to_dict(
                    environment)
                environment_dict["digest"] = dict_digest(environment_dict)
                packageenvironment = PackageEnvironment(**environment_dict)
                dc = DeclarativeContent(content=packageenvironment)
                dc.extra_data = defaultdict(list)

                if packageenvironment.option_ids:
                    for option_id in packageenvironment.option_ids:
                        self.optionalgroup_to_environments[
                            option_id["name"]].append(dc)

                if packageenvironment.group_ids:
                    for group_id in packageenvironment.group_ids:
                        self.group_to_environments[group_id["name"]].append(dc)

                self.dc_environments.append(dc)
Example 7
    async def _parse_packages(self, packages):
        progress_data = {
            'message': 'Parsed Packages',
            'code': 'parsing.packages',
            'total': len(packages),
        }

        with ProgressReport(**progress_data) as packages_pb:
            for pkg in packages.values():
                package = Package(**Package.createrepo_to_dict(pkg))
                artifact = Artifact(size=package.size_package)
                checksum_type = getattr(CHECKSUM_TYPES,
                                        package.checksum_type.upper())
                setattr(artifact, checksum_type, package.pkgId)
                url = urljoin(self.data.remote_url, package.location_href)
                filename = os.path.basename(package.location_href)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=url,
                    relative_path=filename,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                dc = DeclarativeContent(content=package, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # find if a package relates to a modulemd
                if dc.content.nevra in self.data.nevra_to_module.keys():
                    dc.content.is_modular = True
                    for dc_modulemd in self.data.nevra_to_module[
                            dc.content.nevra]:
                        dc.extra_data['modulemd_relation'].append(dc_modulemd)
                        dc_modulemd.extra_data['package_relation'].append(dc)

                if dc.content.name in self.data.pkgname_to_groups.keys():
                    for dc_group in self.data.pkgname_to_groups[
                            dc.content.name]:
                        dc.extra_data['group_relations'].append(dc_group)
                        dc_group.extra_data['related_packages'].append(dc)

                packages_pb.increment()
                await self.put(dc)
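
Unlike Example 5, this variant iterates packages.values() and keeps the whole mapping alive; Example 5 drains it with popitem(last=False) and del so each parsed entry can be garbage-collected early. The draining idiom on its own, with placeholder data:

from collections import OrderedDict

packages = OrderedDict(pkg_a={"size": 1}, pkg_b={"size": 2})

while True:
    try:
        (_, pkg) = packages.popitem(last=False)  # FIFO order; removes the entry from the dict
    except KeyError:
        break
    print(pkg)  # process the package, then drop the reference

print(len(packages))  # -> 0
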
Example 8
    def _parse_modulemd_list(self, modulemd_index):
        modulemd_names = modulemd_index.get_module_names() or []
        modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

        # Parsing modules happens all at one time, and from here on no useful work happens.
        # So just report that it finished this stage.
        modulemd_pb_data = {
            "message": "Parsed Modulemd",
            "code": "parsing.modulemds"
        }
        with ProgressReport(**modulemd_pb_data) as modulemd_pb:
            modulemd_total = len(modulemd_all)
            modulemd_pb.total = modulemd_total
            modulemd_pb.done = modulemd_total

        for modulemd in modulemd_all:
            artifact = modulemd.pop("artifact")
            relative_path = "{}{}{}{}{}snippet".format(
                modulemd[PULP_MODULE_ATTR.NAME],
                modulemd[PULP_MODULE_ATTR.STREAM],
                modulemd[PULP_MODULE_ATTR.VERSION],
                modulemd[PULP_MODULE_ATTR.CONTEXT],
                modulemd[PULP_MODULE_ATTR.ARCH],
            )
            da = DeclarativeArtifact(artifact=artifact,
                                     relative_path=relative_path,
                                     url=self.data.modules_url)
            modulemd_content = Modulemd(**modulemd)
            dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
            dc.extra_data = defaultdict(list)

            # dc.content.artifacts are Modulemd artifacts
            for artifact in dc.content.artifacts:
                self.data.nevra_to_module.setdefault(artifact, set()).add(dc)
            self.data.modulemd_list.append(dc)

        # delete list now that we're done with it for memory savings
        del modulemd_all
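
The nevra_to_module index built at the end is simply a dict of sets keyed by package NEVRA, filled with setdefault(). A tiny stand-alone illustration; the NEVRA strings and the module placeholder are made up:

nevra_to_module = {}

module_dc = "perl:5.30 modulemd dc"  # placeholder for a DeclarativeContent
for nevra in (
    "perl-4:5.30.1-452.module_el8.x86_64",
    "perl-libs-4:5.30.1-452.module_el8.x86_64",
):
    nevra_to_module.setdefault(nevra, set()).add(module_dc)

print(nevra_to_module["perl-4:5.30.1-452.module_el8.x86_64"])
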
Example 9
    async def migrate_to_pulp3(self, batch, pb=None):
        """
        A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

        Plugin writers might want to override this method if it doesn't satisfy their needs as is.

        Args:
            batch: A batch of Pulp2Content objects to migrate to Pulp 3
        """
        for pulp2content in batch:
            pulp_2to3_detail_content = pulp2content.detail_model.get()
            pulp3content = pulp_2to3_detail_content.create_pulp3_content()
            future_relations = {'pulp2content': pulp2content}

            if not pulp2content.downloaded:
                # on_demand content is partially migrated - only Content is created at this stage.
                # Remote Artifact and Content Artifact should be created at the time of
                # importers/remotes migration. Rely on downloaded flag on Pulp2Content to
                # identify on_demand content.
                dc = DeclarativeContent(content=pulp3content)
            else:
                artifact = await self.create_artifact(
                    pulp2content.pulp2_storage_path,
                    pulp_2to3_detail_content.expected_digests,
                    pulp_2to3_detail_content.expected_size)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=NOT_USED,
                    relative_path=pulp_2to3_detail_content.relative_path_for_content_artifact,
                    remote=NOT_USED,
                    deferred_download=False)
                dc = DeclarativeContent(content=pulp3content, d_artifacts=[da])

            dc.extra_data = future_relations
            await self.put(dc)
            if pb:
                pb.increment()
Example 10
    async def _parse_advisories(self, updates):
        progress_data = {
            "message": "Parsed Advisories",
            "code": "parsing.advisories",
            "total": len(updates),
        }
        with ProgressReport(**progress_data) as advisories_pb:
            for update in updates:
                update_record = UpdateRecord(
                    **UpdateRecord.createrepo_to_dict(update))
                update_record.digest = hash_update_record(update)
                future_relations = {
                    "collections": defaultdict(list),
                    "references": []
                }

                for collection in update.collections:
                    coll_dict = UpdateCollection.createrepo_to_dict(collection)
                    coll = UpdateCollection(**coll_dict)

                    for package in collection.packages:
                        pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                            package)
                        pkg = UpdateCollectionPackage(**pkg_dict)
                        future_relations["collections"][coll].append(pkg)

                for reference in update.references:
                    reference_dict = UpdateReference.createrepo_to_dict(
                        reference)
                    ref = UpdateReference(**reference_dict)
                    future_relations["references"].append(ref)

                advisories_pb.increment()
                dc = DeclarativeContent(content=update_record)
                dc.extra_data = future_relations
                await self.put(dc)
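
future_relations here is an ordinary dict: unsaved UpdateCollection objects map to lists of their packages, and references go into a flat list, all left for a later stage to persist. A minimal sketch of that shape with placeholder values:

from collections import defaultdict

future_relations = {"collections": defaultdict(list), "references": []}

# Placeholder stand-ins for UpdateCollection / UpdateCollectionPackage / UpdateReference.
coll = ("RHSA-2021:0001", "security-collection")
future_relations["collections"][coll].append({"name": "bash", "version": "5.1.0"})
future_relations["collections"][coll].append({"name": "bash-doc", "version": "5.1.0"})
future_relations["references"].append({"href": "https://example.com/errata/RHSA-2021-0001"})

print(len(future_relations["collections"][coll]))  # -> 2
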
Example 11
    async def run(self):
        """
        Build `DeclarativeContent` from the repodata.
        """
        remote_url = self.new_url or self.remote.url
        remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"
        optimize_sync = self.optimize

        progress_data = dict(message='Downloading Metadata Files',
                             code='downloading.metadata')
        with ProgressReport(**progress_data) as metadata_pb:
            downloader = self.remote.get_downloader(
                url=urljoin(remote_url, 'repodata/repomd.xml'))
            # TODO: decide how to distinguish between a mirror list and a normal repo
            result = await downloader.run()
            metadata_pb.increment()

            repomd_path = result.path
            repomd = cr.Repomd(repomd_path)

            # Caution: we are not storing when the remote was last updated, so the order of this
            # logic must remain in this order where we first check the version number as other
            # changes than sync could have taken place such that the date or repo version will be
            # different from last sync
            if (optimize_sync and self.repository.last_sync_remote
                    and self.remote.pk == self.repository.last_sync_remote.pk
                    and (self.repository.last_sync_repo_version
                         == self.repository.latest_version().number)
                    and (self.remote.pulp_last_updated <=
                         self.repository.latest_version().pulp_created)
                    and is_previous_version(
                        repomd.revision,
                        self.repository.last_sync_revision_number)):
                optimize_data = dict(message='Optimizing Sync',
                                     code='optimizing.sync')
                with ProgressReport(**optimize_data) as optimize_pb:
                    optimize_pb.done = 1
                    optimize_pb.save()
                    return

            self.repository.last_sync_revision_number = repomd.revision

            if self.treeinfo:
                d_artifacts = [
                    DeclarativeArtifact(
                        artifact=Artifact(),
                        url=urljoin(remote_url, self.treeinfo["filename"]),
                        relative_path=".treeinfo",
                        remote=self.remote,
                        deferred_download=False,
                    )
                ]
                for path, checksum in self.treeinfo["download"][
                        "images"].items():
                    artifact = Artifact(**checksum)
                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=urljoin(remote_url, path),
                        relative_path=path,
                        remote=self.remote,
                        deferred_download=self.deferred_download)
                    d_artifacts.append(da)

                distribution_tree = DistributionTree(
                    **self.treeinfo["distribution_tree"])
                dc = DeclarativeContent(content=distribution_tree,
                                        d_artifacts=d_artifacts)
                dc.extra_data = self.treeinfo
                await self.put(dc)

            package_repodata_urls = {}
            downloaders = []
            modulemd_list = list()
            dc_groups = []
            dc_categories = []
            dc_environments = []
            nevra_to_module = defaultdict(dict)
            pkgname_to_groups = defaultdict(list)
            group_to_categories = defaultdict(list)
            group_to_environments = defaultdict(list)
            optionalgroup_to_environments = defaultdict(list)
            modulemd_results = None
            comps_downloader = None
            main_types = set()
            checksums = {}

            for record in repomd.records:
                checksums[record.type] = record.checksum_type.upper()
                if record.type in PACKAGE_REPODATA:
                    main_types.update([record.type])
                    package_repodata_urls[record.type] = urljoin(
                        remote_url, record.location_href)

                elif record.type in UPDATE_REPODATA:
                    updateinfo_url = urljoin(remote_url, record.location_href)
                    downloader = self.remote.get_downloader(url=updateinfo_url)
                    downloaders.append([downloader.run()])

                elif record.type in COMPS_REPODATA:
                    comps_url = urljoin(remote_url, record.location_href)
                    comps_downloader = self.remote.get_downloader(
                        url=comps_url)

                elif record.type in SKIP_REPODATA:
                    continue

                elif '_zck' in record.type:
                    continue

                elif record.type in MODULAR_REPODATA:
                    modules_url = urljoin(remote_url, record.location_href)
                    modulemd_downloader = self.remote.get_downloader(
                        url=modules_url)
                    modulemd_results = await modulemd_downloader.run()

                elif record.type not in PACKAGE_DB_REPODATA:
                    file_data = {
                        record.checksum_type: record.checksum,
                        "size": record.size
                    }
                    da = DeclarativeArtifact(
                        artifact=Artifact(**file_data),
                        url=urljoin(remote_url, record.location_href),
                        relative_path=record.location_href,
                        remote=self.remote,
                        deferred_download=False)
                    repo_metadata_file = RepoMetadataFile(
                        data_type=record.type,
                        checksum_type=record.checksum_type,
                        checksum=record.checksum,
                    )
                    dc = DeclarativeContent(content=repo_metadata_file,
                                            d_artifacts=[da])
                    await self.put(dc)

            missing_type = set(PACKAGE_REPODATA) - main_types
            if missing_type:
                raise FileNotFoundError(
                    _("XML file(s): {filename} not found").format(
                        filename=", ".join(missing_type)))

            self.repository.original_checksum_types = checksums

            # we have to sync module.yaml first if it exists, to make relations to packages
            if modulemd_results:
                modulemd_index = mmdlib.ModuleIndex.new()
                open_func = gzip.open if modulemd_results.url.endswith(
                    '.gz') else open
                with open_func(modulemd_results.path, 'r') as moduleyaml:
                    content = moduleyaml.read()
                    module_content = content if isinstance(
                        content, str) else content.decode()
                    modulemd_index.update_from_string(module_content, True)

                modulemd_names = modulemd_index.get_module_names() or []
                modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

                # Parsing modules happens all at one time, and from here on no useful work happens.
                # So just report that it finished this stage.
                modulemd_pb_data = {
                    'message': 'Parsed Modulemd',
                    'code': 'parsing.modulemds'
                }
                with ProgressReport(**modulemd_pb_data) as modulemd_pb:
                    modulemd_total = len(modulemd_all)
                    modulemd_pb.total = modulemd_total
                    modulemd_pb.done = modulemd_total

                for modulemd in modulemd_all:
                    artifact = modulemd.pop('artifact')
                    relative_path = '{}{}{}{}{}snippet'.format(
                        modulemd[PULP_MODULE_ATTR.NAME],
                        modulemd[PULP_MODULE_ATTR.STREAM],
                        modulemd[PULP_MODULE_ATTR.VERSION],
                        modulemd[PULP_MODULE_ATTR.CONTEXT],
                        modulemd[PULP_MODULE_ATTR.ARCH])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    modulemd_content = Modulemd(**modulemd)
                    dc = DeclarativeContent(content=modulemd_content,
                                            d_artifacts=[da])
                    dc.extra_data = defaultdict(list)

                    # dc.content.artifacts are Modulemd artifacts
                    for artifact in dc.content.artifacts:
                        nevra_to_module.setdefault(artifact, set()).add(dc)
                    modulemd_list.append(dc)

                # delete list now that we're done with it for memory savings
                del modulemd_all

                modulemd_default_names = parse_defaults(modulemd_index)

                # Parsing module-defaults happens all at one time, and from here on no useful
                # work happens. So just report that it finished this stage.
                modulemd_defaults_pb_data = {
                    'message': 'Parsed Modulemd-defaults',
                    'code': 'parsing.modulemd_defaults'
                }
                with ProgressReport(
                        **modulemd_defaults_pb_data) as modulemd_defaults_pb:
                    modulemd_defaults_total = len(modulemd_default_names)
                    modulemd_defaults_pb.total = modulemd_defaults_total
                    modulemd_defaults_pb.done = modulemd_defaults_total

                for default in modulemd_default_names:
                    artifact = default.pop('artifact')
                    relative_path = '{}{}snippet'.format(
                        default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                        default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    default_content = ModulemdDefaults(**default)
                    dc = DeclarativeContent(content=default_content,
                                            d_artifacts=[da])
                    await self.put(dc)

                # delete list now that we're done with it for memory savings
                del modulemd_default_names

            if comps_downloader:
                comps_result = await comps_downloader.run()

                comps = libcomps.Comps()
                comps.fromxml_f(comps_result.path)

                with ProgressReport(message='Parsed Comps',
                                    code='parsing.comps') as comps_pb:
                    comps_total = (len(comps.groups) + len(comps.categories) +
                                   len(comps.environments))
                    comps_pb.total = comps_total
                    comps_pb.done = comps_total

                if comps.langpacks:
                    langpack_dict = PackageLangpacks.libcomps_to_dict(
                        comps.langpacks)
                    packagelangpack = PackageLangpacks(
                        matches=strdict_to_dict(comps.langpacks),
                        digest=dict_digest(langpack_dict))
                    dc = DeclarativeContent(content=packagelangpack)
                    dc.extra_data = defaultdict(list)
                    await self.put(dc)

                if comps.categories:
                    for category in comps.categories:
                        category_dict = PackageCategory.libcomps_to_dict(
                            category)
                        category_dict['digest'] = dict_digest(category_dict)
                        packagecategory = PackageCategory(**category_dict)
                        dc = DeclarativeContent(content=packagecategory)
                        dc.extra_data = defaultdict(list)

                        if packagecategory.group_ids:
                            for group_id in packagecategory.group_ids:
                                group_to_categories[group_id['name']].append(
                                    dc)
                        dc_categories.append(dc)

                if comps.environments:
                    for environment in comps.environments:
                        environment_dict = PackageEnvironment.libcomps_to_dict(
                            environment)
                        environment_dict['digest'] = dict_digest(
                            environment_dict)
                        packageenvironment = PackageEnvironment(
                            **environment_dict)
                        dc = DeclarativeContent(content=packageenvironment)
                        dc.extra_data = defaultdict(list)

                        if packageenvironment.option_ids:
                            for option_id in packageenvironment.option_ids:
                                optionalgroup_to_environments[
                                    option_id['name']].append(dc)

                        if packageenvironment.group_ids:
                            for group_id in packageenvironment.group_ids:
                                group_to_environments[group_id['name']].append(
                                    dc)

                        dc_environments.append(dc)

                if comps.groups:
                    for group in comps.groups:
                        group_dict = PackageGroup.libcomps_to_dict(group)
                        group_dict['digest'] = dict_digest(group_dict)
                        packagegroup = PackageGroup(**group_dict)
                        dc = DeclarativeContent(content=packagegroup)
                        dc.extra_data = defaultdict(list)

                        if packagegroup.packages:
                            for package in packagegroup.packages:
                                pkgname_to_groups[package['name']].append(dc)

                        if dc.content.id in group_to_categories.keys():
                            for dc_category in group_to_categories[
                                    dc.content.id]:
                                dc.extra_data['category_relations'].append(
                                    dc_category)
                                dc_category.extra_data['packagegroups'].append(
                                    dc)

                        if dc.content.id in group_to_environments.keys():
                            for dc_environment in group_to_environments[
                                    dc.content.id]:
                                dc.extra_data['environment_relations'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'packagegroups'].append(dc)

                        if dc.content.id in optionalgroup_to_environments.keys():
                            for dc_environment in optionalgroup_to_environments[
                                    dc.content.id]:
                                dc.extra_data['env_relations_optional'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'optionalgroups'].append(dc)

                        dc_groups.append(dc)

                for dc_category in dc_categories:
                    await self.put(dc_category)

                for dc_environment in dc_environments:
                    await self.put(dc_environment)

            # delete lists now that we're done with them for memory savings
            del dc_environments
            del dc_categories

            # to preserve order, downloaders are created after all repodata urls are identified
            package_repodata_downloaders = []
            for repodata_type in PACKAGE_REPODATA:
                downloader = self.remote.get_downloader(
                    url=package_repodata_urls[repodata_type])
                package_repodata_downloaders.append(downloader.run())

            downloaders.append(package_repodata_downloaders)

            # asyncio.gather is used to preserve the order of results for package repodata
            pending = [
                asyncio.gather(*downloaders_group)
                for downloaders_group in downloaders
            ]

            while pending:
                done, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED)
                for downloader in done:
                    try:
                        results = downloader.result()
                    except ClientResponseError as exc:
                        raise HTTPNotFound(
                            reason=_("File not found: {filename}").format(
                                filename=exc.request_info.url))
                    if results[0].url == package_repodata_urls['primary']:
                        primary_xml_path = results[0].path
                        filelists_xml_path = results[1].path
                        other_xml_path = results[2].path
                        metadata_pb.done += 3
                        metadata_pb.save()

                        packages = await RpmFirstStage.parse_repodata(
                            primary_xml_path, filelists_xml_path,
                            other_xml_path)
                        # skip SRPM if defined
                        if 'srpm' in self.skip_types:
                            packages = {
                                pkgId: pkg
                                for pkgId, pkg in packages.items()
                                if pkg.arch != 'src'
                            }

                        progress_data = {
                            'message': 'Parsed Packages',
                            'code': 'parsing.packages',
                            'total': len(packages),
                        }
                        with ProgressReport(**progress_data) as packages_pb:
                            for pkg in packages.values():
                                package = Package(
                                    **Package.createrepo_to_dict(pkg))
                                artifact = Artifact(size=package.size_package)
                                checksum_type = getattr(
                                    CHECKSUM_TYPES,
                                    package.checksum_type.upper())
                                setattr(artifact, checksum_type, package.pkgId)
                                url = urljoin(remote_url,
                                              package.location_href)
                                filename = os.path.basename(
                                    package.location_href)
                                da = DeclarativeArtifact(
                                    artifact=artifact,
                                    url=url,
                                    relative_path=filename,
                                    remote=self.remote,
                                    deferred_download=self.deferred_download)
                                dc = DeclarativeContent(content=package,
                                                        d_artifacts=[da])
                                dc.extra_data = defaultdict(list)

                                # find if a package relates to a modulemd
                                if dc.content.nevra in nevra_to_module.keys():
                                    dc.content.is_modular = True
                                    for dc_modulemd in nevra_to_module[
                                            dc.content.nevra]:
                                        dc.extra_data[
                                            'modulemd_relation'].append(
                                                dc_modulemd)
                                        dc_modulemd.extra_data[
                                            'package_relation'].append(dc)

                                if dc.content.name in pkgname_to_groups.keys():
                                    for dc_group in pkgname_to_groups[
                                            dc.content.name]:
                                        dc.extra_data[
                                            'group_relations'].append(dc_group)
                                        dc_group.extra_data[
                                            'related_packages'].append(dc)

                                packages_pb.increment()
                                await self.put(dc)

                    elif results[0].url == updateinfo_url:
                        updateinfo_xml_path = results[0].path
                        metadata_pb.increment()

                        updates = await RpmFirstStage.parse_updateinfo(
                            updateinfo_xml_path)

                        progress_data = {
                            'message': 'Parsed Advisories',
                            'code': 'parsing.advisories',
                            'total': len(updates),
                        }
                        with ProgressReport(**progress_data) as advisories_pb:
                            for update in updates:
                                update_record = UpdateRecord(
                                    **UpdateRecord.createrepo_to_dict(update))
                                update_record.digest = hash_update_record(
                                    update)
                                future_relations = {
                                    'collections': defaultdict(list),
                                    'references': []
                                }

                                for collection in update.collections:
                                    coll_dict = UpdateCollection.createrepo_to_dict(
                                        collection)
                                    coll = UpdateCollection(**coll_dict)

                                    for package in collection.packages:
                                        pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                            package)
                                        pkg = UpdateCollectionPackage(
                                            **pkg_dict)
                                        future_relations['collections'][
                                            coll].append(pkg)

                                for reference in update.references:
                                    reference_dict = UpdateReference.createrepo_to_dict(
                                        reference)
                                    ref = UpdateReference(**reference_dict)
                                    future_relations['references'].append(ref)

                                advisories_pb.increment()
                                dc = DeclarativeContent(content=update_record)
                                dc.extra_data = future_relations
                                await self.put(dc)

            # now send modules down the pipeline since all relations have been set up
            for modulemd in modulemd_list:
                await self.put(modulemd)

            for dc_group in dc_groups:
                await self.put(dc_group)
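
The download scheduling in this example groups the package repodata downloads with asyncio.gather() so their results stay in a known order (primary, filelists, other), then drains whole groups with asyncio.wait() as they finish. A small runnable sketch of that pattern, with sleeps standing in for downloaders:

import asyncio


async def fetch(name, delay):
    await asyncio.sleep(delay)  # stands in for downloader.run()
    return name


async def main():
    downloaders = [
        # gather() keeps each group's results in submission order
        asyncio.gather(fetch("primary", 0.03), fetch("filelists", 0.01), fetch("other", 0.02)),
        asyncio.gather(fetch("updateinfo", 0.005)),
    ]
    pending = downloaders
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for group in done:
            print(group.result())  # each group's results keep their original order


asyncio.run(main())
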
Example 12
    async def run(self):
        """
        Build `DeclarativeContent` from the repodata.
        """
        packages_pb = ProgressBar(message='Parsed Packages')
        erratum_pb = ProgressBar(message='Parsed Erratum')

        packages_pb.save()
        erratum_pb.save()

        with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
            downloader = self.remote.get_downloader(
                url=urljoin(self.remote.url, 'repodata/repomd.xml'))
            # TODO: decide how to distinguish between a mirror list and a normal repo
            result = await downloader.run()
            metadata_pb.increment()

            repomd_path = result.path
            repomd = cr.Repomd(repomd_path)
            package_repodata_urls = {}
            downloaders = []

            for record in repomd.records:
                if record.type in PACKAGE_REPODATA:
                    package_repodata_urls[record.type] = urljoin(
                        self.remote.url, record.location_href)
                elif record.type in UPDATE_REPODATA:
                    updateinfo_url = urljoin(self.remote.url,
                                             record.location_href)
                    downloader = self.remote.get_downloader(url=updateinfo_url)
                    downloaders.append([downloader.run()])
                else:
                    log.info(
                        _('Unknown repodata type: {t}. Skipped.').format(
                            t=record.type))
                    # TODO: skip databases, save unknown types to publish them as-is

            # to preserve order, downloaders are created after all repodata urls are identified
            package_repodata_downloaders = []
            for repodata_type in PACKAGE_REPODATA:
                downloader = self.remote.get_downloader(
                    url=package_repodata_urls[repodata_type])
                package_repodata_downloaders.append(downloader.run())

            downloaders.append(package_repodata_downloaders)

            # asyncio.gather is used to preserve the order of results for package repodata
            pending = [
                asyncio.gather(*downloaders_group)
                for downloaders_group in downloaders
            ]

            while pending:
                done, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED)
                for downloader in done:
                    results = downloader.result()
                    if results[0].url == package_repodata_urls['primary']:
                        primary_xml_path = results[0].path
                        filelists_xml_path = results[1].path
                        other_xml_path = results[2].path
                        metadata_pb.done += 3
                        metadata_pb.save()

                        packages = await RpmFirstStage.parse_repodata(
                            primary_xml_path, filelists_xml_path,
                            other_xml_path)
                        packages_pb.total = len(packages)
                        packages_pb.state = 'running'
                        packages_pb.save()

                        for pkg in packages.values():
                            package = Package(
                                **Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
                            checksum_type = getattr(
                                CHECKSUM_TYPES, package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(self.remote.url,
                                          package.location_href)
                            filename = os.path.basename(package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package,
                                                    d_artifacts=[da])
                            packages_pb.increment()
                            await self.put(dc)

                    elif results[0].url == updateinfo_url:
                        updateinfo_xml_path = results[0].path
                        metadata_pb.increment()

                        updates = await RpmFirstStage.parse_updateinfo(
                            updateinfo_xml_path)

                        erratum_pb.total = len(updates)
                        erratum_pb.state = 'running'
                        erratum_pb.save()

                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = RpmFirstStage.hash_update_record(
                                update)
                            future_relations = {
                                'collections': defaultdict(list),
                                'references': []
                            }

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(
                                    collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(**pkg_dict)
                                    future_relations['collections'][
                                        coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(
                                    reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            erratum_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

        packages_pb.state = 'completed'
        erratum_pb.state = 'completed'
        packages_pb.save()
        erratum_pb.save()
Example 13
    async def migrate_to_pulp3(self, content_model, content_type):
        """
        A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

        Plugin writers might want to override this method if it doesn't satisfy their needs as is.

        In this implementation there is an assumption that each content has one artifact.

        Args:
            content_model: A Pulp 2to3 detail content model whose units are being migrated
            content_type: type of pulp2 content that is being migrated
        """
        @functools.lru_cache(maxsize=20)
        def get_remote_by_importer_id(importer_id):
            """
            Args:
                importer_id(str): Id of an importer in Pulp 2

            Returns:
                remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3

            """
            try:
                pulp2importer = Pulp2Importer.objects.get(
                    pulp2_object_id=importer_id)
            except ObjectDoesNotExist:
                return
            return pulp2importer.pulp3_remote

        futures = []
        is_lazy_type = content_type in self.migrator.lazy_types
        is_artifactless_type = content_type in self.migrator.artifactless_types
        has_future = content_type in self.migrator.future_types
        is_multi_artifact = content_type in self.migrator.multi_artifact_types

        if is_lazy_type:
            # go through all of the content that hasn't been migrated OR has been migrated
            # but has new lazy catalog entries.
            units_with_new_lces = Pulp2LazyCatalog.objects.filter(
                is_migrated=False).values('pulp2_unit_id').distinct()
            already_migrated = ~Q(pulp2content__pulp3_content=None)
            no_new_lces = ~Q(pulp2content__pulp2_id__in=units_with_new_lces)
            pulp_2to3_detail_qs = content_model.objects.exclude(
                already_migrated & no_new_lces)
        else:
            # go through all of the content that hasn't been migrated
            pulp_2to3_detail_qs = content_model.objects.filter(
                pulp2content__pulp3_content=None)

        # order by pulp2_repo if it's set
        if content_model.set_pulp2_repo:
            pulp_2to3_detail_qs = pulp_2to3_detail_qs.order_by('repo_id')

        with ProgressReport(message='Migrating {} content to Pulp 3 {}'.format(
                self.migrator.pulp2_plugin, content_type),
                            code='migrating.{}.content'.format(
                                self.migrator.pulp2_plugin),
                            total=pulp_2to3_detail_qs.count()) as pb:
            select_extra = [
                'pulp2content',
                'pulp2content__pulp3_content',
            ]

            if content_model.set_pulp2_repo:
                select_extra.append('pulp2content__pulp2_repo')

            pulp_2to3_detail_qs = pulp_2to3_detail_qs.select_related(
                *select_extra)
            for pulp_2to3_detail_content in pulp_2to3_detail_qs.iterator(
                    chunk_size=800):
                dc = None
                pulp2content = pulp_2to3_detail_content.pulp2content
                # only content that supports on_demand download can have entries in LCE
                if is_lazy_type:
                    # get all Lazy Catalog Entries (LCEs) for this content
                    pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
                        pulp2_unit_id=pulp2content.pulp2_id,
                        is_migrated=False,
                    )

                    if not pulp2content.downloaded and not pulp2lazycatalog:
                        _logger.warn(
                            _('On_demand content cannot be migrated without an entry in the '
                              'lazy catalog, pulp2 unit_id: {}'.format(
                                  pulp2content.pulp2_id)))
                        continue

                if pulp2content.pulp3_content is not None and is_lazy_type and pulp2lazycatalog:
                    # find already created pulp3 content
                    pulp3content = pulp2content.pulp3_content
                    extra_info = None
                    if is_multi_artifact:
                        extra_info = pulp_2to3_detail_content.get_treeinfo_serialized()
                        # If we can't find the .treeinfo for the Distribution, warn and skip
                        if extra_info is None:
                            _logger.warning(
                                _("Failed to find or instantiate extra_info for multi-artifact "
                                  "pulp2 unit_id: {} ; skipping".format(
                                      pulp2content.pulp2_id)))
                            continue
                else:
                    # create pulp3 content and assign relations if present
                    pulp3content, extra_info = pulp_2to3_detail_content.create_pulp3_content()

                # If we can't find/create the Distribution, warn and skip
                if pulp3content is None:
                    _logger.warning(
                        _("Failed to find or instantiate pulp3 content for pulp2 unit_id: {} ;"
                          " skipping".format(pulp2content.pulp2_id)))
                    continue

                future_relations = {'pulp2content': pulp2content}
                if extra_info:
                    future_relations.update(extra_info)

                if is_multi_artifact:
                    d_artifacts = []
                    base_path = pulp2content.pulp2_storage_path
                    remotes = set()
                    missing_artifact = False

                    for image_relative_path in extra_info['download'][
                            'images']:
                        image_path = os.path.join(base_path,
                                                  image_relative_path)
                        downloaded = os.path.exists(image_path)
                        if downloaded:
                            artifact = await self.create_artifact(
                                image_path, None, None, downloaded=downloaded)
                            if artifact is None:
                                continue
                        else:
                            artifact = Artifact()

                        lces = pulp2lazycatalog.filter(
                            pulp2_storage_path=image_path)
                        if lces:
                            remote_declarative_artifacts = []

                            for lce in lces:
                                remote = get_remote_by_importer_id(
                                    lce.pulp2_importer_id)

                                if not remote and not downloaded:
                                    continue

                                remotes.add(remote)
                                da = DeclarativeArtifact(
                                    artifact=artifact,
                                    url=lce.pulp2_url,
                                    relative_path=image_relative_path,
                                    remote=remote,
                                    deferred_download=not downloaded)
                                remote_declarative_artifacts.append(da)

                            if not remote_declarative_artifacts:
                                missing_artifact = True
                                break

                            d_artifacts.extend(remote_declarative_artifacts)
                        else:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=NOT_USED,
                                relative_path=image_relative_path,
                                remote=None,
                                deferred_download=False)
                            d_artifacts.append(da)

                    if missing_artifact:
                        _logger.warn(
                            _('On_demand content cannot be migrated without a remote '
                              'pulp2 unit_id: {}'.format(
                                  pulp2content.pulp2_id)))
                        continue

                    for lce in pulp2lazycatalog:
                        lce.is_migrated = True
                    future_relations.update({'lces': list(pulp2lazycatalog)})

                    # We do this last because we need the remote url which is only found in the LCE
                    # of the image files. There is no LCE for the .treeinfo file itself.
                    relative_path = pulp_2to3_detail_content.relative_path_for_content_artifact
                    treeinfo_path = os.path.join(
                        pulp2content.pulp2_storage_path, relative_path)
                    artifact = await self.create_artifact(treeinfo_path,
                                                          None,
                                                          None,
                                                          downloaded=True)
                    if artifact is None:
                        continue
                    if remotes:
                        for remote in remotes:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=urljoin(remote.url, relative_path),
                                relative_path=relative_path,
                                remote=remote,
                                deferred_download=False,
                            )
                            d_artifacts.append(da)
                    else:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=relative_path,
                            remote=None,
                            deferred_download=False,
                        )
                        d_artifacts.append(da)
                    dc = DeclarativeContent(content=pulp3content,
                                            d_artifacts=d_artifacts)
                    dc.extra_data = future_relations
                    await self.put(dc)
                # not all content units have files, create DC without artifact
                elif is_artifactless_type:
                    # dc without artifact
                    dc = DeclarativeContent(content=pulp3content)
                    dc.extra_data = future_relations
                    await self.put(dc)
                else:

                    # create artifact for content that has file
                    artifact = await self.create_artifact(
                        pulp2content.pulp2_storage_path,
                        pulp_2to3_detail_content.expected_digests,
                        pulp_2to3_detail_content.expected_size,
                        downloaded=pulp2content.downloaded)
                    if artifact is None:
                        continue

                    if is_lazy_type and pulp2lazycatalog:
                        # handle DA and RA creation for content that supports on_demand
                        # Downloaded or on_demand content with LCEs.
                        #
                        # To create multiple remote artifacts, create multiple instances of
                        # declarative content which will differ by url/remote in their
                        # declarative artifacts
                        at_least_one_lce_migrated = False
                        for lce in pulp2lazycatalog:
                            remote = get_remote_by_importer_id(
                                lce.pulp2_importer_id)
                            deferred_download = not pulp2content.downloaded
                            if not remote and deferred_download:
                                continue

                            relative_path = (
                                pulp_2to3_detail_content.
                                relative_path_for_content_artifact)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=lce.pulp2_url,
                                relative_path=relative_path,
                                remote=remote,
                                deferred_download=deferred_download)
                            lce.is_migrated = True
                            at_least_one_lce_migrated = True
                            dc = DeclarativeContent(content=pulp3content,
                                                    d_artifacts=[da])
                            dc.extra_data = future_relations
                            await self.put(dc)

                        if not at_least_one_lce_migrated:
                            _logger.warning(
                                _('On_demand content cannot be migrated without a remote, '
                                  'pulp2 unit_id: {}'.format(
                                      pulp2content.pulp2_id)))
                        future_relations.update(
                            {'lces': list(pulp2lazycatalog)})
                    else:
                        relative_path = (pulp_2to3_detail_content.
                                         relative_path_for_content_artifact)
                        da = DeclarativeArtifact(artifact=artifact,
                                                 url=NOT_USED,
                                                 relative_path=relative_path,
                                                 remote=None,
                                                 deferred_download=False)
                        dc = DeclarativeContent(content=pulp3content,
                                                d_artifacts=[da])
                        dc.extra_data = future_relations
                        await self.put(dc)

                if pb:
                    pb.increment()

                if has_future and dc:
                    futures.append(dc)
                resolve_futures = (len(futures) >= DEFAULT_BATCH_SIZE
                                   or (pb and pb.done == pb.total))
                if resolve_futures:
                    for dc in futures:
                        await dc.resolution()
                    futures.clear()
Example no. 14
0
    async def run(self):
        """
        Build `DeclarativeContent` from the repodata.
        """
        packages_pb = ProgressReport(message='Parsed Packages',
                                     code='parsing.packages')
        errata_pb = ProgressReport(message='Parsed Erratum',
                                   code='parsing.errata')
        modulemd_pb = ProgressReport(message='Parsed Modulemd',
                                     code='parsing.modulemds')
        modulemd_defaults_pb = ProgressReport(
            message='Parsed Modulemd-defaults', code='parsing.modulemddefaults')
        comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

        packages_pb.save()
        errata_pb.save()
        comps_pb.save()

        remote_url = self.new_url or self.remote.url
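        # urljoin() discards the last path segment of the base URL unless it ends with '/'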
        remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

        progress_data = dict(message='Downloading Metadata Files',
                             code='downloading.metadata')
        with ProgressReport(**progress_data) as metadata_pb:
            downloader = self.remote.get_downloader(
                url=urljoin(remote_url, 'repodata/repomd.xml'))
            # TODO: decide how to distinguish between a mirror list and a normal repo
            result = await downloader.run()
            metadata_pb.increment()

            if self.kickstart:
                d_artifacts = []
                for path, checksum in self.kickstart["download"][
                        "images"].items():
                    artifact = Artifact(**checksum)

                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=urljoin(remote_url, path),
                        relative_path=path,
                        remote=self.remote,
                        deferred_download=self.deferred_download)

                    d_artifacts.append(da)

                distribution_tree = DistributionTree(
                    **self.kickstart["distribution_tree"])
                dc = DeclarativeContent(content=distribution_tree,
                                        d_artifacts=d_artifacts)
                dc.extra_data = self.kickstart
                await self.put(dc)

            repomd_path = result.path
            repomd = cr.Repomd(repomd_path)
            package_repodata_urls = {}
            downloaders = []
            modulemd_list = list()
            dc_groups = []
            dc_categories = []
            dc_environments = []
            nevra_to_module = defaultdict(dict)
            pkgname_to_groups = defaultdict(list)
            group_to_categories = defaultdict(list)
            group_to_environments = defaultdict(list)
            optionalgroup_to_environments = defaultdict(list)
            modulemd_results = None
            comps_downloader = None

            for record in repomd.records:
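                # primary/filelists/other urls are collected so they can be downloaded in a
                # fixed order later; updateinfo, comps and modules get their own downloaders;
                # any other record, except databases and skipped types, becomes a RepoMetadataFile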
                if record.type in PACKAGE_REPODATA:
                    package_repodata_urls[record.type] = urljoin(
                        remote_url, record.location_href)
                elif record.type in UPDATE_REPODATA:
                    updateinfo_url = urljoin(remote_url, record.location_href)
                    downloader = self.remote.get_downloader(url=updateinfo_url)
                    downloaders.append([downloader.run()])

                elif record.type in COMPS_REPODATA:
                    comps_url = urljoin(remote_url, record.location_href)
                    comps_downloader = self.remote.get_downloader(
                        url=comps_url)

                elif record.type in SKIP_REPODATA:
                    continue

                elif record.type in MODULAR_REPODATA:
                    modules_url = urljoin(remote_url, record.location_href)
                    modulemd_downloader = self.remote.get_downloader(
                        url=modules_url)
                    modulemd_results = await modulemd_downloader.run()

                elif record.type not in PACKAGE_DB_REPODATA:
                    file_data = {
                        record.checksum_type: record.checksum,
                        "size": record.size
                    }
                    da = DeclarativeArtifact(
                        artifact=Artifact(**file_data),
                        url=urljoin(remote_url, record.location_href),
                        relative_path=record.location_href,
                        remote=self.remote,
                        deferred_download=False)
                    repo_metadata_file = RepoMetadataFile(
                        data_type=record.type,
                        checksum_type=record.checksum_type,
                        checksum=record.checksum,
                    )
                    dc = DeclarativeContent(content=repo_metadata_file,
                                            d_artifacts=[da])
                    await self.put(dc)

            # we have to sync module.yaml first if it exists, to make relations to packages
            if modulemd_results:
                modulemd_index = mmdlib.ModuleIndex.new()
                open_func = gzip.open if modulemd_results.url.endswith(
                    '.gz') else open
                with open_func(modulemd_results.path, 'r') as moduleyaml:
                    modulemd_index.update_from_string(
                        moduleyaml.read().decode(), True)

                modulemd_names = modulemd_index.get_module_names() or []
                modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

                modulemd_pb.total = len(modulemd_all)
                modulemd_pb.state = 'running'
                modulemd_pb.save()

                for modulemd in modulemd_all:
                    artifact = modulemd.pop('artifact')
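                    # each modulemd is stored as a text snippet; build a synthetic relative
                    # path from its name/stream/version/context/arch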
                    relative_path = '{}{}{}{}{}snippet'.format(
                        modulemd[PULP_MODULE_ATTR.NAME],
                        modulemd[PULP_MODULE_ATTR.STREAM],
                        modulemd[PULP_MODULE_ATTR.VERSION],
                        modulemd[PULP_MODULE_ATTR.CONTEXT],
                        modulemd[PULP_MODULE_ATTR.ARCH])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    modulemd_content = Modulemd(**modulemd)
                    dc = DeclarativeContent(content=modulemd_content,
                                            d_artifacts=[da])
                    dc.extra_data = defaultdict(list)

                    # dc.content.artifacts are Modulemd artifacts
                    for artifact in json.loads(dc.content.artifacts):
                        nevra_to_module.setdefault(artifact, set()).add(dc)
                    modulemd_list.append(dc)

                modulemd_default_names = parse_defaults(modulemd_index)

                modulemd_defaults_pb.total = len(modulemd_default_names)
                modulemd_defaults_pb.state = 'running'
                modulemd_defaults_pb.save()

                for default in modulemd_default_names:
                    artifact = default.pop('artifact')
                    relative_path = '{}{}snippet'.format(
                        default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                        default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                    da = DeclarativeArtifact(artifact=artifact,
                                             relative_path=relative_path,
                                             url=modules_url)
                    default_content = ModulemdDefaults(**default)
                    modulemd_defaults_pb.increment()
                    dc = DeclarativeContent(content=default_content,
                                            d_artifacts=[da])
                    await self.put(dc)

            if comps_downloader:
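                # groups, categories and environments are collected first and cross-linked via
                # extra_data; categories and environments are emitted below, groups only after
                # packages are parsed so package<->group relations can be attached as well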
                comps_result = await comps_downloader.run()

                comps = libcomps.Comps()
                comps.fromxml_f(comps_result.path)

                comps_pb.total = (len(comps.groups) + len(comps.categories) +
                                  len(comps.environments))
                comps_pb.state = 'running'
                comps_pb.save()

                if comps.langpacks:
                    langpack_dict = PackageLangpacks.libcomps_to_dict(
                        comps.langpacks)
                    packagelangpack = PackageLangpacks(
                        matches=strdict_to_dict(comps.langpacks),
                        digest=dict_digest(langpack_dict))
                    dc = DeclarativeContent(content=packagelangpack)
                    dc.extra_data = defaultdict(list)
                    await self.put(dc)

                if comps.categories:
                    for category in comps.categories:
                        category_dict = PackageCategory.libcomps_to_dict(
                            category)
                        category_dict['digest'] = dict_digest(category_dict)
                        packagecategory = PackageCategory(**category_dict)
                        dc = DeclarativeContent(content=packagecategory)
                        dc.extra_data = defaultdict(list)

                        if packagecategory.group_ids:
                            for group_id in packagecategory.group_ids:
                                group_to_categories[group_id['name']].append(
                                    dc)
                        dc_categories.append(dc)

                if comps.environments:
                    for environment in comps.environments:
                        environment_dict = PackageEnvironment.libcomps_to_dict(
                            environment)
                        environment_dict['digest'] = dict_digest(
                            environment_dict)
                        packageenvironment = PackageEnvironment(
                            **environment_dict)
                        dc = DeclarativeContent(content=packageenvironment)
                        dc.extra_data = defaultdict(list)

                        if packageenvironment.option_ids:
                            for option_id in packageenvironment.option_ids:
                                optionalgroup_to_environments[
                                    option_id['name']].append(dc)

                        if packageenvironment.group_ids:
                            for group_id in packageenvironment.group_ids:
                                group_to_environments[group_id['name']].append(
                                    dc)

                        dc_environments.append(dc)

                if comps.groups:
                    for group in comps.groups:
                        group_dict = PackageGroup.libcomps_to_dict(group)
                        group_dict['digest'] = dict_digest(group_dict)
                        packagegroup = PackageGroup(**group_dict)
                        dc = DeclarativeContent(content=packagegroup)
                        dc.extra_data = defaultdict(list)

                        if packagegroup.packages:
                            for package in packagegroup.packages:
                                pkgname_to_groups[package['name']].append(dc)

                        if dc.content.id in group_to_categories.keys():
                            for dc_category in group_to_categories[
                                    dc.content.id]:
                                dc.extra_data['category_relations'].append(
                                    dc_category)
                                dc_category.extra_data['packagegroups'].append(
                                    dc)

                        if dc.content.id in group_to_environments.keys():
                            for dc_environment in group_to_environments[
                                    dc.content.id]:
                                dc.extra_data['environment_relations'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'packagegroups'].append(dc)

                        if dc.content.id in optionalgroup_to_environments.keys():
                            for dc_environment in optionalgroup_to_environments[
                                    dc.content.id]:
                                dc.extra_data['env_relations_optional'].append(
                                    dc_environment)
                                dc_environment.extra_data[
                                    'optionalgroups'].append(dc)

                        dc_groups.append(dc)

                for dc_category in dc_categories:
                    comps_pb.increment()
                    await self.put(dc_category)

                for dc_environment in dc_environments:
                    comps_pb.increment()
                    await self.put(dc_environment)

            # to preserve order, downloaders are created after all repodata urls are identified
            package_repodata_downloaders = []
            for repodata_type in PACKAGE_REPODATA:
                downloader = self.remote.get_downloader(
                    url=package_repodata_urls[repodata_type])
                package_repodata_downloaders.append(downloader.run())

            downloaders.append(package_repodata_downloaders)

            # asyncio.gather is used to preserve the order of results for package repodata
            pending = [
                asyncio.gather(*downloaders_group)
                for downloaders_group in downloaders
            ]

            while pending:
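                # handle download groups as they complete; the package-repodata group is
                # recognized by comparing the first result's url with the primary url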
                done, pending = await asyncio.wait(
                    pending, return_when=asyncio.FIRST_COMPLETED)
                for downloader in done:
                    results = downloader.result()
                    if results[0].url == package_repodata_urls['primary']:
                        primary_xml_path = results[0].path
                        filelists_xml_path = results[1].path
                        other_xml_path = results[2].path
                        metadata_pb.done += 3
                        metadata_pb.save()

                        packages = await RpmFirstStage.parse_repodata(
                            primary_xml_path, filelists_xml_path,
                            other_xml_path)
                        packages_pb.total = len(packages)
                        packages_pb.state = 'running'
                        packages_pb.save()

                        for pkg in packages.values():
                            package = Package(
                                **Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
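                            # record only the package size and the pkgId checksum on the
                            # artifact, stored under the repo's checksum type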
                            checksum_type = getattr(
                                CHECKSUM_TYPES, package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(remote_url, package.location_href)
                            filename = os.path.basename(package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package,
                                                    d_artifacts=[da])
                            dc.extra_data = defaultdict(list)

                            # find if a package relates to a modulemd
                            if dc.content.nevra in nevra_to_module.keys():
                                dc.content.is_modular = True
                                for dc_modulemd in nevra_to_module[
                                        dc.content.nevra]:
                                    dc.extra_data['modulemd_relation'].append(
                                        dc_modulemd)
                                    dc_modulemd.extra_data[
                                        'package_relation'].append(dc)

                            if dc.content.name in pkgname_to_groups.keys():
                                for dc_group in pkgname_to_groups[
                                        dc.content.name]:
                                    dc.extra_data['group_relations'].append(
                                        dc_group)
                                    dc_group.extra_data[
                                        'related_packages'].append(dc)

                            packages_pb.increment()
                            await self.put(dc)

                    elif results[0].url == updateinfo_url:
                        updateinfo_xml_path = results[0].path
                        metadata_pb.increment()

                        updates = await RpmFirstStage.parse_updateinfo(
                            updateinfo_xml_path)

                        errata_pb.total = len(updates)
                        errata_pb.state = 'running'
                        errata_pb.save()

                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = RpmFirstStage.hash_update_record(
                                update)
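                            # collections and references can only be linked once the
                            # UpdateRecord is saved; stash them in extra_data for a later stage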
                            future_relations = {
                                'collections': defaultdict(list),
                                'references': []
                            }

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(
                                    collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(**pkg_dict)
                                    future_relations['collections'][
                                        coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(
                                    reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            errata_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

            # now send modules down the pipeline since all relations have been set up
            for modulemd in modulemd_list:
                modulemd_pb.increment()
                await self.put(modulemd)

            for dc_group in dc_groups:
                comps_pb.increment()
                await self.put(dc_group)

        packages_pb.state = 'completed'
        errata_pb.state = 'completed'
        modulemd_pb.state = 'completed'
        modulemd_defaults_pb.state = 'completed'
        comps_pb.state = 'completed'
        packages_pb.save()
        errata_pb.save()
        modulemd_pb.save()
        modulemd_defaults_pb.save()
        comps_pb.save()
Example no. 15
0
    async def migrate_to_pulp3(self, batch, pb=None):
        """
        A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

        Plugin writers might want to override this method if it doesn't satisfy their needs as is.

        In this implementation there is an assumption that each content has one artifact.

        Args:
            batch: A batch of Pulp2Content objects to migrate to Pulp 3
        """
        def get_remote_by_importer_id(importer_id):
            """
            Args:
                importer_id(str): Id of an importer in Pulp 2

            Returns:
                remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3

            """
            try:
                pulp2importer = Pulp2Importer.objects.get(
                    pulp2_object_id=importer_id)
            except ObjectDoesNotExist:
                return
            return pulp2importer.pulp3_remote

        for pulp2content in batch:
            pulp_2to3_detail_content = pulp2content.detail_model

            # get all Lazy Catalog Entries (LCEs) for this content
            pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
                pulp2_unit_id=pulp2content.pulp2_id)

            if not pulp2lazycatalog and not pulp2content.downloaded:
                _logger.warning(
                    _('On_demand content cannot be migrated without an entry in the lazy '
                      'catalog, pulp2 unit_id: {}'.format(
                          pulp2content.pulp2_id)))
                continue

            pulp3content = await pulp_2to3_detail_content.create_pulp3_content()
            future_relations = {'pulp2content': pulp2content}

            artifact = await self.create_artifact(
                pulp2content.pulp2_storage_path,
                pulp_2to3_detail_content.expected_digests,
                pulp_2to3_detail_content.expected_size,
                downloaded=pulp2content.downloaded)
            # Downloaded content with no LCE
            if not pulp2lazycatalog and pulp2content.downloaded:
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=NOT_USED,
                    relative_path=pulp_2to3_detail_content.
                    relative_path_for_content_artifact,
                    remote=None,
                    deferred_download=False)
                dc = DeclarativeContent(content=pulp3content, d_artifacts=[da])
                dc.extra_data = future_relations
                await self.put(dc)

            # Downloaded or on_demand content with LCEs.
            #
            # To create multiple remote artifacts, create multiple instances of declarative
            # content which will differ by url/remote in their declarative artifacts
            for lce in pulp2lazycatalog:
                remote = get_remote_by_importer_id(lce.pulp2_importer_id)
                deferred_download = not pulp2content.downloaded
                if not remote and deferred_download:
                    _logger.warning(
                        _('On_demand content cannot be migrated without a remote, '
                          'pulp2 unit_id: {}'.format(pulp2content.pulp2_id)))
                    continue

                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=lce.pulp2_url,
                    relative_path=pulp_2to3_detail_content.
                    relative_path_for_content_artifact,
                    remote=remote,
                    deferred_download=deferred_download)
                dc = DeclarativeContent(content=pulp3content, d_artifacts=[da])
                dc.extra_data = future_relations
                await self.put(dc)

            if pb:
                pb.increment()
Example no. 16
0
    async def migrate_to_pulp3(self, content_model, content_type):
        """
        A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

        Plugin writers might want to override this method if it doesn't satisfy their needs as is.

        Most content types are assumed to carry a single artifact; multi-artifact and
        artifactless types are handled in separate branches below.

        Args:
            content_model: pulp 2to3 detail content model whose un-migrated units are processed
            content_type: type of pulp2 content that is being migrated
        """
        @functools.lru_cache(maxsize=20)
        def get_remote_by_importer_id(importer_id):
            """
            Args:
                importer_id(str): Id of an importer in Pulp 2

            Returns:
                remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3

            """
            try:
                pulp2importer = Pulp2Importer.objects.get(
                    pulp2_object_id=importer_id)
            except ObjectDoesNotExist:
                return
            return pulp2importer.pulp3_remote

        futures = []
        is_lazy_type = content_type in self.migrator.lazy_types
        is_artifactless_type = content_type in self.migrator.artifactless_types
        has_future = content_type in self.migrator.future_types
        is_multi_artifact = content_type in self.migrator.multi_artifact_types

        if is_lazy_type:
            # go through all of the content that haven't been migrated OR have been migrated
            # but have new lazy catalog entries.
            units_with_new_lces = (Pulp2LazyCatalog.objects.filter(
                is_migrated=False).values("pulp2_unit_id").distinct())
            already_migrated = ~Q(pulp2content__pulp3_content=None)
            no_new_lces = ~Q(pulp2content__pulp2_id__in=units_with_new_lces)
            pulp_2to3_detail_qs = content_model.objects.exclude(
                already_migrated & no_new_lces)
        else:
            # go through all of the content that haven't been migrated
            pulp_2to3_detail_qs = content_model.objects.filter(
                pulp2content__pulp3_content=None)

        # order by pulp2_repo if it's set
        if content_model.set_pulp2_repo:
            pulp_2to3_detail_qs = pulp_2to3_detail_qs.order_by("repo_id")

        async with ProgressReport(
                message="Migrating {} content to Pulp 3".format(content_type),
                code="migrating.{}.content".format(self.migrator.pulp2_plugin),
                total=await sync_to_async(pulp_2to3_detail_qs.count)(),
        ) as pb:
            select_extra = [
                "pulp2content",
                "pulp2content__pulp3_content",
            ]

            if content_model.set_pulp2_repo:
                select_extra.append("pulp2content__pulp2_repo")

            pulp_2to3_detail_qs = pulp_2to3_detail_qs.select_related(
                *select_extra)
            async for pulp_2to3_detail_content in sync_to_async_iterable(
                    pulp_2to3_detail_qs.iterator(chunk_size=800)):
                dc = None
                pulp2content = await sync_to_async(
                    Pulp2Content.objects.get
                )(pk=pulp_2to3_detail_content.pulp2content.pk)

                # only content that supports on_demand download can have entries in LCE
                if is_lazy_type:
                    # get all Lazy Catalog Entries (LCEs) for this content
                    pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
                        pulp2_unit_id=pulp2content.pulp2_id,
                        is_migrated=False,
                    )
                    # force the queryset to evaluate
                    await sync_to_async(bool)(pulp2lazycatalog)

                    if not pulp2content.downloaded and not pulp2lazycatalog:
                        # A distribution tree can be from an on_demand repo but without any images,
                        # e.g. CentOS 8 High Availability. Do not skip in that case.
                        if not is_multi_artifact:
                            _logger.warning(
                                _("On_demand content cannot be migrated without an entry in the "
                                  "lazy catalog, pulp2 unit_id: {}".format(
                                      pulp2content.pulp2_id)))
                            continue

                if (pulp2content.pulp3_content is not None and is_lazy_type
                        and pulp2lazycatalog):
                    # find already created pulp3 content
                    pulp3content = pulp2content.pulp3_content
                    extra_info = None
                    if is_multi_artifact:
                        extra_info = pulp_2to3_detail_content.get_treeinfo_serialized()
                        # If we can't find the .treeinfo for the Distribution, warn and skip
                        if extra_info is None:
                            _logger.warning(
                                _("Failed to find or instantiate extra_info for multi-artifact "
                                  "pulp2 unit_id: {} ; skipping".format(
                                      pulp2content.pulp2_id)))
                            continue
                else:
                    # create pulp3 content and assign relations if present
                    pulp3content, extra_info = await sync_to_async(
                        pulp_2to3_detail_content.create_pulp3_content)()

                # If we can't find/create the Distribution, warn and skip
                if pulp3content is None:
                    _logger.warning(
                        _("Failed to find or instantiate pulp3 content for pulp2 unit_id: {} ;"
                          " skipping".format(pulp2content.pulp2_id)))
                    continue

                future_relations = {"pulp2content": pulp2content}
                if extra_info:
                    future_relations.update(extra_info)

                if is_multi_artifact:
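                    # a distribution tree is a single content unit with many artifacts: every
                    # image listed in .treeinfo plus the .treeinfo file itself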
                    d_artifacts = []
                    base_path = pulp2content.pulp2_storage_path
                    remotes = set()
                    missing_artifact = False

                    for image_relative_path in extra_info["download"][
                            "images"]:
                        remote_url_tuples = []
                        remote_declarative_artifacts = []
                        image_path = os.path.join(base_path,
                                                  image_relative_path)
                        downloaded = os.path.exists(image_path)
                        if downloaded:
                            artifact = await self.create_artifact(
                                image_path, None, None, downloaded=downloaded)
                            if artifact is None:
                                continue
                        else:
                            artifact = Artifact()

                        lces = await sync_to_async(list)(
                            pulp2lazycatalog.filter(
                                pulp2_storage_path=image_path))

                        if not lces and not downloaded:
                            continue

                        # collect all urls and respective migrated remotes for the image
                        for lce in lces:
                            remote = await sync_to_async(
                                get_remote_by_importer_id)(
                                    lce.pulp2_importer_id)
                            if remote:
                                remotes.add(remote)
                                remote_url_tuples.append(
                                    (remote, lce.pulp2_url))

                        for remote, url in remote_url_tuples:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=image_relative_path,
                                remote=remote,
                                deferred_download=not downloaded,
                            )
                            remote_declarative_artifacts.append(da)

                        if not remote_url_tuples:
                            # either no LCEs existed but it's a downloaded content (and we can
                            # proceed), or remotes for any of LCEs haven't been migrated (and
                            # nothing can be done at this point)
                            if not downloaded:
                                missing_artifact = True
                                break

                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=NOT_USED,
                                relative_path=image_relative_path,
                                remote=None,
                                deferred_download=False,
                            )
                            d_artifacts.append(da)

                        d_artifacts.extend(remote_declarative_artifacts)

                    # Only skip the rest of the steps if there are any images that are expected
                    # to be downloaded. There are distribution trees without images in the wild,
                    # e.g. CentOS 8 High Availability.
                    if missing_artifact and extra_info["download"]["images"]:
                        _logger.warning(
                            _("On_demand content cannot be migrated without a remote, "
                              "pulp2 unit_id: {}".format(
                                  pulp2content.pulp2_id)))
                        continue

                    for lce in pulp2lazycatalog:
                        lce.is_migrated = True
                    future_relations.update({"lces": list(pulp2lazycatalog)})

                    # We do this last because we need the remote url which is only found in the LCE
                    # of the image files. There is no LCE for the .treeinfo file itself.
                    relative_path = (pulp_2to3_detail_content.
                                     relative_path_for_content_artifact)
                    treeinfo_path = os.path.join(
                        pulp2content.pulp2_storage_path, relative_path)
                    artifact = await self.create_artifact(treeinfo_path,
                                                          None,
                                                          None,
                                                          downloaded=True)
                    if artifact is None:
                        continue
                    if remotes:
                        for remote in remotes:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=os.path.join(remote.url, relative_path),
                                relative_path=relative_path,
                                remote=remote,
                                deferred_download=False,
                            )
                            d_artifacts.append(da)
                    else:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=relative_path,
                            remote=None,
                            deferred_download=False,
                        )
                        d_artifacts.append(da)
                    dc = DeclarativeContent(content=pulp3content,
                                            d_artifacts=d_artifacts)
                    dc.extra_data = future_relations
                    await self.put(dc)
                # not all content units have files, create DC without artifact
                elif is_artifactless_type:
                    # dc without artifact
                    dc = DeclarativeContent(content=pulp3content)
                    dc.extra_data = future_relations
                    await self.put(dc)
                else:

                    # create artifact for content that has file
                    artifact = await self.create_artifact(
                        pulp2content.pulp2_storage_path,
                        pulp_2to3_detail_content.expected_digests,
                        pulp_2to3_detail_content.expected_size,
                        downloaded=pulp2content.downloaded,
                    )
                    if artifact is None:
                        if pb:
                            await pb.aincrement()
                        continue

                    relative_path = (pulp_2to3_detail_content.
                                     relative_path_for_content_artifact)
                    remote_lce_tuples = []
                    deferred_download = not pulp2content.downloaded

                    if is_lazy_type and pulp2lazycatalog:
                        for lce in pulp2lazycatalog:
                            remote = await sync_to_async(
                                get_remote_by_importer_id)(
                                    lce.pulp2_importer_id)
                            if remote:
                                remote_lce_tuples.append((remote, lce))

                    # handle DA and RA creation for content that supports on_demand
                    # Downloaded or on_demand content with LCEs.
                    #
                    # To create multiple remote artifacts, create multiple instances of
                    # declarative content which will differ by url/remote in their
                    # declarative artifacts

                    if remote_lce_tuples:
                        for remote, lce in remote_lce_tuples:
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=lce.pulp2_url,
                                relative_path=relative_path,
                                remote=remote,
                                deferred_download=deferred_download,
                            )
                            lce.is_migrated = True
                            dc = DeclarativeContent(content=pulp3content,
                                                    d_artifacts=[da])

                            # yes, all LCEs are assigned for each dc to be resolved at a later
                            # stage. Some LCEs might be "bad" and not have a migrated importer
                            # but we still need to resolved such. It creates some duplicated LCEs
                            # to process later but ensures that all are resolved if at least one
                            # valid one is migrated.
                            future_relations.update(
                                {"lces": list(pulp2lazycatalog)})
                            dc.extra_data = future_relations
                            await self.put(dc)

                    else:
                        # No migratable LCE available
                        if deferred_download:
                            _logger.warning(
                                _("On_demand content cannot be migrated without a remote, "
                                  "pulp2 unit_id: {}".format(
                                      pulp2content.pulp2_id)))
                            continue

                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=relative_path,
                            remote=None,
                            deferred_download=False,
                        )
                        dc = DeclarativeContent(content=pulp3content,
                                                d_artifacts=[da])
                        dc.extra_data = future_relations
                        await self.put(dc)

                if pb:
                    await pb.aincrement()

                if has_future and dc:
                    futures.append(dc)
                resolve_futures = len(futures) >= DEFAULT_BATCH_SIZE
                if resolve_futures:
                    for dc in futures:
                        await dc.resolution()
                    futures.clear()

            # resolve futures if there are any left
            for dc in futures:
                await dc.resolution()
            futures.clear()