async def create_pulp3_content(self):
    """
    Create a Pulp 3 Package content for saving it later in a bulk operation.
    """
    cr_package = await get_cr_obj(self)
    pkg_dict = Package.createrepo_to_dict(cr_package)
    return Package(**pkg_dict)
def create_pulp3_content(self):
    """
    Create a Pulp 3 Package content for saving it later in a bulk operation.
    """
    cr_package = get_cr_obj(self)
    pkg_dict = Package.createrepo_to_dict(cr_package)
    pkg_dict['is_modular'] = self.is_modular
    return (Package(**pkg_dict), None)
def deferred_validate(self, data):
    """
    Validate the rpm package data.

    Args:
        data (dict): Data to be validated

    Returns:
        dict: Data that has been validated
    """
    data = super().deferred_validate(data)

    # export META from rpm and prepare dict as saveable format
    try:
        new_pkg = Package.createrepo_to_dict(read_crpackage_from_artifact(data["artifact"]))
    except OSError:
        log.info(traceback.format_exc())
        raise NotAcceptable(detail="RPM file cannot be parsed for metadata")

    attrs = {key: new_pkg[key] for key in Package.natural_key_fields()}
    package = Package.objects.filter(**attrs)

    if package.exists():
        keywords = (
            "name",
            "epoch",
            "version",
            "release",
            "arch",
            "checksum_type",
            "pkgId",
        )
        error_data = ", ".join(
            ["=".join(item) for item in new_pkg.items() if item[0] in keywords]
        )

        package.get().touch()
        raise serializers.ValidationError(
            _("There is already a package with: {values}.").format(values=error_data)
        )

    new_pkg["location_href"] = (
        format_nevra_short(
            new_pkg["name"],
            new_pkg["epoch"],
            new_pkg["version"],
            new_pkg["release"],
            new_pkg["arch"],
        )
        + ".rpm"
    )

    if not data.get("relative_path"):
        data["relative_path"] = new_pkg["location_href"]

    data.update(new_pkg)
    return data
def publish_artifacts(self, content, prefix=""): """ Publish artifacts. Args: content (pulpcore.plugin.models.Content): content set. prefix (str): a relative path prefix for the published artifact """ published_artifacts = [] # Special case for Packages contentartifact_qs = ContentArtifact.objects.filter( content__in=content).filter( content__pulp_type=Package.get_pulp_type()) for content_artifact in contentartifact_qs.values( "pk", "relative_path").iterator(): relative_path = content_artifact["relative_path"] relative_path = os.path.join(prefix, PACKAGES_DIRECTORY, relative_path.lower()[0], relative_path) published_artifacts.append( PublishedArtifact( relative_path=relative_path, publication=self.publication, content_artifact_id=content_artifact["pk"], )) # Handle everything else is_treeinfo = Q(relative_path__in=["treeinfo", ".treeinfo"]) unpublishable_types = Q(content__pulp_type__in=[ RepoMetadataFile.get_pulp_type(), Modulemd.get_pulp_type(), ModulemdDefaults.get_pulp_type(), # already dealt with Package.get_pulp_type(), ]) contentartifact_qs = (ContentArtifact.objects.filter( content__in=content).exclude(unpublishable_types).exclude( is_treeinfo)) for content_artifact in contentartifact_qs.values( "pk", "relative_path").iterator(): published_artifacts.append( PublishedArtifact( relative_path=content_artifact["relative_path"], publication=self.publication, content_artifact_id=content_artifact["pk"], )) PublishedArtifact.objects.bulk_create(published_artifacts, batch_size=2000)
async def _parse_packages(self, packages):
    progress_data = {
        "message": "Parsed Packages",
        "code": "sync.parsing.packages",
        "total": len(packages),
    }

    with ProgressReport(**progress_data) as packages_pb:
        while True:
            try:
                (_, pkg) = packages.popitem(last=False)
            except KeyError:
                break
            package = Package(**Package.createrepo_to_dict(pkg))
            del pkg
            artifact = Artifact(size=package.size_package)
            checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
            setattr(artifact, checksum_type, package.pkgId)
            url = urlpath_sanitize(self.data.remote_url, package.location_href)
            filename = os.path.basename(package.location_href)
            da = DeclarativeArtifact(
                artifact=artifact,
                url=url,
                relative_path=filename,
                remote=self.remote,
                deferred_download=self.deferred_download,
            )
            dc = DeclarativeContent(content=package, d_artifacts=[da])
            dc.extra_data = defaultdict(list)

            # find if a package relates to a modulemd
            if dc.content.nevra in self.data.nevra_to_module.keys():
                dc.content.is_modular = True
                for dc_modulemd in self.data.nevra_to_module[dc.content.nevra]:
                    dc.extra_data["modulemd_relation"].append(dc_modulemd)
                    dc_modulemd.extra_data["package_relation"].append(dc)

            if dc.content.name in self.data.pkgname_to_groups.keys():
                for dc_group in self.data.pkgname_to_groups[dc.content.name]:
                    dc.extra_data["group_relations"].append(dc_group)
                    dc_group.extra_data["related_packages"].append(dc)

            packages_pb.increment()
            await self.put(dc)
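# A note on the ``packages.popitem(last=False)`` loop above: it drains the parsed package
# dict in FIFO order so each createrepo_c object can be garbage-collected as soon as it has
# been turned into a DeclarativeContent, instead of keeping a whole repository's worth of
# parsed metadata in memory. A minimal, standalone sketch of the same pattern (the queue
# contents here are purely illustrative):
from collections import OrderedDict

work_queue = OrderedDict((("pkg-a", 1), ("pkg-b", 2), ("pkg-c", 3)))
processed = []
while True:
    try:
        key, value = work_queue.popitem(last=False)  # pop the oldest entry first
    except KeyError:
        break  # queue exhausted
    processed.append(key)  # the real code builds and emits a DeclarativeContent here
assert processed == ["pkg-a", "pkg-b", "pkg-c"] and not work_queue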
def _prepare_package(artifact, filename):
    """
    Helper function for creating a package.

    Copy the file to a temp directory under the user-provided filename and parse it
    into a saveable format.

    Returns: artifact model as dict

    Args:
        artifact: initialized and validated artifact to save
        filename: name of file uploaded by user
    """
    # Copy file to a temp directory under the user provided filename
    with tempfile.TemporaryDirectory() as td:
        temp_path = os.path.join(td, filename)
        shutil.copy2(artifact.file.path, temp_path)
        cr_pkginfo = createrepo_c.package_from_rpm(temp_path)
        package = Package.createrepo_to_dict(cr_pkginfo)

    package['location_href'] = filename

    # parse the package metadata into a saveable format
    new_pkg = {}
    for key, value in package.items():
        if isinstance(value, list):
            new_pkg[key] = json.dumps(value)
        else:
            new_pkg[key] = value

    return new_pkg
def publish_artifacts(self, content):
    """
    Publish artifacts.

    Args:
        content (pulpcore.plugin.models.Content): content set.
    """
    published_artifacts = []
    for content_artifact in ContentArtifact.objects.filter(
        content__in=content.exclude(
            pulp_type__in=[
                RepoMetadataFile.get_pulp_type(),
                Modulemd.get_pulp_type(),
                ModulemdDefaults.get_pulp_type(),
            ]
        ).distinct()
    ).iterator():
        relative_path = content_artifact.relative_path
        if content_artifact.content.pulp_type == Package.get_pulp_type():
            relative_path = os.path.join(
                PACKAGES_DIRECTORY, relative_path.lower()[0], content_artifact.relative_path
            )
        published_artifacts.append(
            PublishedArtifact(
                relative_path=relative_path,
                publication=self.publication,
                content_artifact=content_artifact,
            )
        )
    PublishedArtifact.objects.bulk_create(published_artifacts, batch_size=2000)
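# For reference, the "Packages/<first letter>/<filename>" layout computed inline above boils
# down to this small helper (a sketch only; it assumes PACKAGES_DIRECTORY is the plain
# "Packages" directory name used by the canonical RPM repository layout):
import os

PACKAGES_DIRECTORY = "Packages"

def canonical_package_path(relative_path, prefix=""):
    """Place an RPM under Packages/<first letter of its filename>/<filename>."""
    return os.path.join(prefix, PACKAGES_DIRECTORY, relative_path.lower()[0], relative_path)

# canonical_package_path("Foo-1.0-1.noarch.rpm") -> "Packages/f/Foo-1.0-1.noarch.rpm"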
def deferred_validate(self, data):
    """
    Validate the rpm package data.

    Args:
        data (dict): Data to be validated

    Returns:
        dict: Data that has been validated
    """
    data = super().deferred_validate(data)

    # export META from rpm and prepare dict as saveable format
    try:
        new_pkg = _prepare_package(data["artifact"], data["relative_path"])
    except OSError:
        raise NotAcceptable(detail='RPM file cannot be parsed for metadata.')

    attrs = {key: new_pkg[key] for key in Package.natural_key_fields()}
    package = Package.objects.filter(**attrs)

    if package.exists():
        keywords = ('name', 'epoch', 'version', 'release', 'arch', 'checksum_type', 'pkgId')
        error_data = ", ".join(
            ["=".join(item) for item in new_pkg.items() if item[0] in keywords]
        )
        raise serializers.ValidationError(
            _(
                "There is already a package with: {values}."
            ).format(values=error_data)
        )

    data.update(new_pkg)
    return data
def _prepare_package(artifact, filename):
    """
    Helper function for creating a package.

    Copy the file to a temp directory under the user-provided filename.

    Returns: artifact model as dict

    Args:
        artifact: initialized and validated artifact to save
        filename: name of file uploaded by user
    """
    artifact_file = storage.open(artifact.file.name)
    with tempfile.NamedTemporaryFile("wb", dir=".", suffix=filename) as temp_file:
        shutil.copyfileobj(artifact_file, temp_file)
        temp_file.flush()
        cr_pkginfo = createrepo_c.package_from_rpm(
            temp_file.name, changelog_limit=settings.KEEP_CHANGELOG_LIMIT
        )

    package = Package.createrepo_to_dict(cr_pkginfo)

    package["location_href"] = filename

    artifact_file.close()
    return package
def _apply_retention_policy(self, new_version):
    """Apply the repository's "retain_package_versions" settings to the new version.

    Remove all non-modular packages that are older than the retention policy. A value of 0
    for the package retention policy represents disabled. A value of 3 would mean that the
    3 most recent versions of each package would be kept while older versions are discarded.

    Args:
        new_version (models.RepositoryVersion): Repository version to filter
    """
    assert not new_version.complete, \
        "Cannot apply retention policy to completed repository versions"

    if self.retain_package_versions > 0:
        # It would be more ideal if, instead of annotating with an age and filtering manually,
        # we could use Django to filter the particular Package content we want to delete.
        # Something like ".filter(F('age') > self.retain_package_versions)" would be better
        # however this is not currently possible with Django. It would be possible with raw
        # SQL but the repository version content membership subquery is currently
        # django-managed and would be difficult to share.
        #
        # Instead we have to do the filtering manually.
        nonmodular_packages = Package.objects.with_age().filter(
            pk__in=new_version.content.filter(pulp_type=Package.get_pulp_type()),
            is_modular=False,  # don't want to filter out modular RPMs
        ).only('pk')

        old_packages = []
        for package in nonmodular_packages:
            if package.age > self.retain_package_versions:
                old_packages.append(package.pk)

        new_version.remove_content(Content.objects.filter(pk__in=old_packages))
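# The ``Package.objects.with_age()`` call above relies on a queryset annotation that ranks
# packages sharing a name from newest to oldest, so ``age == 1`` is the most recent version.
# A hedged sketch of what such an annotation could look like using Django window functions;
# the partition fields and the ``evr`` ordering column are assumptions for illustration, not
# necessarily the plugin's exact definition:
from django.db.models import F, Window
from django.db.models.functions import Rank

def with_age(package_queryset):
    """Annotate each package with its rank among same-named packages, newest first."""
    return package_queryset.annotate(
        age=Window(
            expression=Rank(),
            partition_by=[F("name"), F("arch")],
            order_by=F("evr").desc(),
        )
    )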
async def _parse_packages(self, packages):
    progress_data = {
        'message': 'Parsed Packages',
        'code': 'parsing.packages',
        'total': len(packages),
    }

    with ProgressReport(**progress_data) as packages_pb:
        for pkg in packages.values():
            package = Package(**Package.createrepo_to_dict(pkg))
            artifact = Artifact(size=package.size_package)
            checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
            setattr(artifact, checksum_type, package.pkgId)
            url = urljoin(self.data.remote_url, package.location_href)
            filename = os.path.basename(package.location_href)
            da = DeclarativeArtifact(
                artifact=artifact,
                url=url,
                relative_path=filename,
                remote=self.remote,
                deferred_download=self.deferred_download,
            )
            dc = DeclarativeContent(content=package, d_artifacts=[da])
            dc.extra_data = defaultdict(list)

            # find if a package relates to a modulemd
            if dc.content.nevra in self.data.nevra_to_module.keys():
                dc.content.is_modular = True
                for dc_modulemd in self.data.nevra_to_module[dc.content.nevra]:
                    dc.extra_data['modulemd_relation'].append(dc_modulemd)
                    dc_modulemd.extra_data['package_relation'].append(dc)

            if dc.content.name in self.data.pkgname_to_groups.keys():
                for dc_group in self.data.pkgname_to_groups[dc.content.name]:
                    dc.extra_data['group_relations'].append(dc_group)
                    dc_group.extra_data['related_packages'].append(dc)

            packages_pb.increment()
            await self.put(dc)
def create(self, request):
    """
    Create a new Package from a request.
    """
    try:
        artifact = self.get_resource(request.data['_artifact'], Artifact)
    except KeyError:
        raise serializers.ValidationError(detail={'_artifact': _('This field is required')})

    try:
        filename = request.data['filename']
    except KeyError:
        raise serializers.ValidationError(detail={'filename': _('This field is required')})

    # Copy file to a temp directory under the user provided filename
    with tempfile.TemporaryDirectory() as td:
        temp_path = os.path.join(td, filename)
        shutil.copy2(artifact.file.path, temp_path)
        cr_pkginfo = createrepo_c.package_from_rpm(temp_path)
        package = Package.createrepo_to_dict(cr_pkginfo)

    package['location_href'] = filename

    # TODO: Clean this up, maybe make a new function for the purpose of parsing it into
    # a saveable format
    new_pkg = {}
    new_pkg['_artifact'] = request.data['_artifact']
    for key, value in package.items():
        if isinstance(value, list):
            new_pkg[key] = json.dumps(value)
        else:
            new_pkg[key] = value

    serializer = self.get_serializer(data=new_pkg)
    serializer.is_valid(raise_exception=True)
    serializer.validated_data.pop('_artifact')
    package = serializer.save()

    if package.pk:
        ContentArtifact.objects.create(
            artifact=artifact,
            content=package,
            relative_path=package.filename,
        )

    headers = self.get_success_headers(request.data)
    return Response(serializer.data, status=status.HTTP_201_CREATED, headers=headers)
def _prepare_package(artifact, filename):
    """
    Helper function for creating a package.

    Copy the file to a temp directory under the user-provided filename.

    Returns: artifact model as dict

    Args:
        artifact: initialized and validated artifact to save
        filename: name of file uploaded by user
    """
    artifact_file = storage.open(artifact.file.name)
    with tempfile.NamedTemporaryFile('wb', suffix=filename) as temp_file:
        shutil.copyfileobj(artifact_file, temp_file)
        temp_file.flush()
        cr_pkginfo = createrepo_c.package_from_rpm(temp_file.name)

    package = Package.createrepo_to_dict(cr_pkginfo)

    package['location_href'] = filename
    return package
def _prepare_package(artifact, filename):
    """
    Helper function for creating a package.

    Copy the file to a temp directory under the user-provided filename.

    Returns: artifact model as dict

    Args:
        artifact: initialized and validated artifact to save
        filename: name of file uploaded by user
    """
    # Copy file to a temp directory under the user provided filename
    with tempfile.TemporaryDirectory() as td:
        temp_path = os.path.join(td, filename)
        shutil.copy2(artifact.file.path, temp_path)
        cr_pkginfo = createrepo_c.package_from_rpm(temp_path)
        package = Package.createrepo_to_dict(cr_pkginfo)

    package['location_href'] = filename
    return package
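# The helpers above all funnel through the same createrepo_c entry point: parse the RPM
# header from a file on disk and expose its metadata as attributes, which then map onto
# Package fields via createrepo_to_dict. A minimal sketch (the path is hypothetical):
import createrepo_c

cr_pkg = createrepo_c.package_from_rpm("/tmp/foo-1.0-1.noarch.rpm")
print(cr_pkg.name, cr_pkg.version, cr_pkg.release, cr_pkg.arch, cr_pkg.location_href)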
def find_children_of_content(content, repository_version):
    """Finds the content referenced directly by other content and returns it all together.

    Finds RPMs referenced by Advisory/Errata content.

    Args:
        content (iterable): Content for which to resolve children
        repository_version (pulpcore.models.RepositoryVersion): Source repo version

    Returns: Queryset of Content objects that are children of the initial set of content
    """
    # Advisories that were selected to be copied
    advisory_ids = content.filter(pulp_type=UpdateRecord.get_pulp_type()).only('pk')
    # All packages in the source repository version
    package_ids = repository_version.content.filter(
        pulp_type=Package.get_pulp_type()).only('pk')
    # All modules in the source repository version
    module_ids = repository_version.content.filter(
        pulp_type=Modulemd.get_pulp_type()).only('pk')

    advisories = UpdateRecord.objects.filter(pk__in=advisory_ids)
    packages = Package.objects.filter(pk__in=package_ids)
    modules = Modulemd.objects.filter(pk__in=module_ids)

    children = set()

    for advisory in advisories:
        # Find rpms referenced by Advisories/Errata
        package_nevras = advisory.get_pkglist()
        for nevra in package_nevras:
            (name, epoch, version, release, arch) = nevra
            try:
                package = packages.get(
                    name=name, epoch=epoch, version=version, release=release, arch=arch
                )
                children.add(package.pk)
            except Package.DoesNotExist:
                raise
            except MultipleObjectsReturned:
                raise

        module_nsvcas = advisory.get_module_list()
        for nsvca in module_nsvcas:
            (name, stream, version, context, arch) = nsvca
            try:
                module = modules.get(
                    name=name, stream=stream, version=version, context=context, arch=arch
                )
                children.add(module.pk)
            except Modulemd.DoesNotExist:
                raise
            except MultipleObjectsReturned:
                raise

    # TODO: Find rpms referenced by PackageGroups,
    # PackageGroups referenced by PackageCategories, etc.

    return Content.objects.filter(pk__in=children)
def find_children_of_content(content, src_repo_version):
    """Finds the content referenced directly by other content and returns it all together.

    Finds RPMs referenced by Advisory/Errata content.

    Args:
        content (iterable): Content for which to resolve children
        src_repo_version (pulpcore.models.RepositoryVersion): Source repo version

    Returns: Queryset of Content objects that are children of the initial set of content
    """
    # Content that was selected to be copied
    advisory_ids = content.filter(pulp_type=UpdateRecord.get_pulp_type()).only('pk')
    packagecategory_ids = content.filter(pulp_type=PackageCategory.get_pulp_type()).only('pk')
    packageenvironment_ids = content.filter(
        pulp_type=PackageEnvironment.get_pulp_type()).only('pk')
    packagegroup_ids = content.filter(pulp_type=PackageGroup.get_pulp_type()).only('pk')

    # Content in the source repository version
    package_ids = src_repo_version.content.filter(pulp_type=Package.get_pulp_type()).only('pk')
    module_ids = src_repo_version.content.filter(pulp_type=Modulemd.get_pulp_type()).only('pk')

    advisories = UpdateRecord.objects.filter(pk__in=advisory_ids)
    packages = Package.objects.filter(pk__in=package_ids)
    packagecategories = PackageCategory.objects.filter(pk__in=packagecategory_ids)
    packageenvironments = PackageEnvironment.objects.filter(pk__in=packageenvironment_ids)
    packagegroups = PackageGroup.objects.filter(pk__in=packagegroup_ids)
    modules = Modulemd.objects.filter(pk__in=module_ids)

    children = set()

    for advisory in advisories:
        # Find rpms referenced by Advisories/Errata
        package_nevras = advisory.get_pkglist()
        for nevra in package_nevras:
            (name, epoch, version, release, arch) = nevra
            try:
                package = packages.get(
                    name=name, epoch=epoch, version=version, release=release, arch=arch
                )
                children.add(package.pk)
            except Package.DoesNotExist:
                raise
            except MultipleObjectsReturned:
                raise

        module_nsvcas = advisory.get_module_list()
        for nsvca in module_nsvcas:
            (name, stream, version, context, arch) = nsvca
            try:
                module = modules.get(
                    name=name, stream=stream, version=version, context=context, arch=arch
                )
                children.add(module.pk)
            except Modulemd.DoesNotExist:
                raise
            except MultipleObjectsReturned:
                raise

    # PackageCategories & PackageEnvironments resolution must go before PackageGroups
    # TODO: refactor to be more efficient (lower number of queries)
    for packagecategory in packagecategories.iterator():
        for category_package_group in packagecategory.group_ids:
            category_package_groups = PackageGroup.objects.filter(
                name=category_package_group['name'], pk__in=src_repo_version.content
            )
            children.update([pkggroup.pk for pkggroup in category_package_groups])
            packagegroups = packagegroups.union(category_package_groups)

    for packageenvironment in packageenvironments.iterator():
        for env_package_group in packageenvironment.group_ids:
            env_package_groups = PackageGroup.objects.filter(
                name=env_package_group['name'], pk__in=src_repo_version.content
            )
            children.update([envgroup.pk for envgroup in env_package_groups])
            packagegroups = packagegroups.union(env_package_groups)
        for optional_env_package_group in packageenvironment.option_ids:
            opt_env_package_groups = PackageGroup.objects.filter(
                name=optional_env_package_group['name'], pk__in=src_repo_version.content
            )
            children.update([optpkggroup.pk for optpkggroup in opt_env_package_groups])
            packagegroups = packagegroups.union(opt_env_package_groups)

    # Find rpms referenced by PackageGroups
    for packagegroup in packagegroups.iterator():
        group_package_names = [pkg['name'] for pkg in packagegroup.packages]
        for pkg in group_package_names:
            packages_by_name = [
                pkg
                for pkg in Package.objects.with_age().filter(
                    name=pkg, pk__in=src_repo_version.content
                )
                if pkg.age == 1
            ]
            for pkg in packages_by_name:
                children.add(pkg.pk)

    return Content.objects.filter(pk__in=children)
async def run(self): """ Build `DeclarativeContent` from the repodata. """ remote_url = self.new_url or self.remote.url remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/" optimize_sync = self.optimize progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata') with ProgressReport(**progress_data) as metadata_pb: downloader = self.remote.get_downloader( url=urljoin(remote_url, 'repodata/repomd.xml')) # TODO: decide how to distinguish between a mirror list and a normal repo result = await downloader.run() metadata_pb.increment() repomd_path = result.path repomd = cr.Repomd(repomd_path) # Caution: we are not storing when the remote was last updated, so the order of this # logic must remain in this order where we first check the version number as other # changes than sync could have taken place such that the date or repo version will be # different from last sync if (optimize_sync and self.repository.last_sync_remote and self.remote.pk == self.repository.last_sync_remote.pk and (self.repository.last_sync_repo_version == self.repository.latest_version().number) and (self.remote.pulp_last_updated <= self.repository.latest_version().pulp_created) and is_previous_version( repomd.revision, self.repository.last_sync_revision_number)): optimize_data = dict(message='Optimizing Sync', code='optimizing.sync') with ProgressReport(**optimize_data) as optimize_pb: optimize_pb.done = 1 optimize_pb.save() return self.repository.last_sync_revision_number = repomd.revision if self.treeinfo: d_artifacts = [ DeclarativeArtifact( artifact=Artifact(), url=urljoin(remote_url, self.treeinfo["filename"]), relative_path=".treeinfo", remote=self.remote, deferred_download=False, ) ] for path, checksum in self.treeinfo["download"][ "images"].items(): artifact = Artifact(**checksum) da = DeclarativeArtifact( artifact=artifact, url=urljoin(remote_url, path), relative_path=path, remote=self.remote, deferred_download=self.deferred_download) d_artifacts.append(da) distribution_tree = DistributionTree( **self.treeinfo["distribution_tree"]) dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts) dc.extra_data = self.treeinfo await self.put(dc) package_repodata_urls = {} downloaders = [] modulemd_list = list() dc_groups = [] dc_categories = [] dc_environments = [] nevra_to_module = defaultdict(dict) pkgname_to_groups = defaultdict(list) group_to_categories = defaultdict(list) group_to_environments = defaultdict(list) optionalgroup_to_environments = defaultdict(list) modulemd_results = None comps_downloader = None main_types = set() checksums = {} for record in repomd.records: checksums[record.type] = record.checksum_type.upper() if record.type in PACKAGE_REPODATA: main_types.update([record.type]) package_repodata_urls[record.type] = urljoin( remote_url, record.location_href) elif record.type in UPDATE_REPODATA: updateinfo_url = urljoin(remote_url, record.location_href) downloader = self.remote.get_downloader(url=updateinfo_url) downloaders.append([downloader.run()]) elif record.type in COMPS_REPODATA: comps_url = urljoin(remote_url, record.location_href) comps_downloader = self.remote.get_downloader( url=comps_url) elif record.type in SKIP_REPODATA: continue elif '_zck' in record.type: continue elif record.type in MODULAR_REPODATA: modules_url = urljoin(remote_url, record.location_href) modulemd_downloader = self.remote.get_downloader( url=modules_url) modulemd_results = await modulemd_downloader.run() elif record.type not in PACKAGE_DB_REPODATA: file_data = { 
record.checksum_type: record.checksum, "size": record.size } da = DeclarativeArtifact( artifact=Artifact(**file_data), url=urljoin(remote_url, record.location_href), relative_path=record.location_href, remote=self.remote, deferred_download=False) repo_metadata_file = RepoMetadataFile( data_type=record.type, checksum_type=record.checksum_type, checksum=record.checksum, ) dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da]) await self.put(dc) missing_type = set(PACKAGE_REPODATA) - main_types if missing_type: raise FileNotFoundError( _("XML file(s): {filename} not found").format( filename=", ".join(missing_type))) self.repository.original_checksum_types = checksums # we have to sync module.yaml first if it exists, to make relations to packages if modulemd_results: modulemd_index = mmdlib.ModuleIndex.new() open_func = gzip.open if modulemd_results.url.endswith( '.gz') else open with open_func(modulemd_results.path, 'r') as moduleyaml: content = moduleyaml.read() module_content = content if isinstance( content, str) else content.decode() modulemd_index.update_from_string(module_content, True) modulemd_names = modulemd_index.get_module_names() or [] modulemd_all = parse_modulemd(modulemd_names, modulemd_index) # Parsing modules happens all at one time, and from here on no useful work happens. # So just report that it finished this stage. modulemd_pb_data = { 'message': 'Parsed Modulemd', 'code': 'parsing.modulemds' } with ProgressReport(**modulemd_pb_data) as modulemd_pb: modulemd_total = len(modulemd_all) modulemd_pb.total = modulemd_total modulemd_pb.done = modulemd_total for modulemd in modulemd_all: artifact = modulemd.pop('artifact') relative_path = '{}{}{}{}{}snippet'.format( modulemd[PULP_MODULE_ATTR.NAME], modulemd[PULP_MODULE_ATTR.STREAM], modulemd[PULP_MODULE_ATTR.VERSION], modulemd[PULP_MODULE_ATTR.CONTEXT], modulemd[PULP_MODULE_ATTR.ARCH]) da = DeclarativeArtifact(artifact=artifact, relative_path=relative_path, url=modules_url) modulemd_content = Modulemd(**modulemd) dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da]) dc.extra_data = defaultdict(list) # dc.content.artifacts are Modulemd artifacts for artifact in dc.content.artifacts: nevra_to_module.setdefault(artifact, set()).add(dc) modulemd_list.append(dc) # delete list now that we're done with it for memory savings del modulemd_all modulemd_default_names = parse_defaults(modulemd_index) # Parsing module-defaults happens all at one time, and from here on no useful # work happens. So just report that it finished this stage. 
modulemd_defaults_pb_data = { 'message': 'Parsed Modulemd-defaults', 'code': 'parsing.modulemd_defaults' } with ProgressReport( **modulemd_defaults_pb_data) as modulemd_defaults_pb: modulemd_defaults_total = len(modulemd_default_names) modulemd_defaults_pb.total = modulemd_defaults_total modulemd_defaults_pb.done = modulemd_defaults_total for default in modulemd_default_names: artifact = default.pop('artifact') relative_path = '{}{}snippet'.format( default[PULP_MODULEDEFAULTS_ATTR.MODULE], default[PULP_MODULEDEFAULTS_ATTR.STREAM]) da = DeclarativeArtifact(artifact=artifact, relative_path=relative_path, url=modules_url) default_content = ModulemdDefaults(**default) dc = DeclarativeContent(content=default_content, d_artifacts=[da]) await self.put(dc) # delete list now that we're done with it for memory savings del modulemd_default_names if comps_downloader: comps_result = await comps_downloader.run() comps = libcomps.Comps() comps.fromxml_f(comps_result.path) with ProgressReport(message='Parsed Comps', code='parsing.comps') as comps_pb: comps_total = (len(comps.groups) + len(comps.categories) + len(comps.environments)) comps_pb.total = comps_total comps_pb.done = comps_total if comps.langpacks: langpack_dict = PackageLangpacks.libcomps_to_dict( comps.langpacks) packagelangpack = PackageLangpacks( matches=strdict_to_dict(comps.langpacks), digest=dict_digest(langpack_dict)) dc = DeclarativeContent(content=packagelangpack) dc.extra_data = defaultdict(list) await self.put(dc) if comps.categories: for category in comps.categories: category_dict = PackageCategory.libcomps_to_dict( category) category_dict['digest'] = dict_digest(category_dict) packagecategory = PackageCategory(**category_dict) dc = DeclarativeContent(content=packagecategory) dc.extra_data = defaultdict(list) if packagecategory.group_ids: for group_id in packagecategory.group_ids: group_to_categories[group_id['name']].append( dc) dc_categories.append(dc) if comps.environments: for environment in comps.environments: environment_dict = PackageEnvironment.libcomps_to_dict( environment) environment_dict['digest'] = dict_digest( environment_dict) packageenvironment = PackageEnvironment( **environment_dict) dc = DeclarativeContent(content=packageenvironment) dc.extra_data = defaultdict(list) if packageenvironment.option_ids: for option_id in packageenvironment.option_ids: optionalgroup_to_environments[ option_id['name']].append(dc) if packageenvironment.group_ids: for group_id in packageenvironment.group_ids: group_to_environments[group_id['name']].append( dc) dc_environments.append(dc) if comps.groups: for group in comps.groups: group_dict = PackageGroup.libcomps_to_dict(group) group_dict['digest'] = dict_digest(group_dict) packagegroup = PackageGroup(**group_dict) dc = DeclarativeContent(content=packagegroup) dc.extra_data = defaultdict(list) if packagegroup.packages: for package in packagegroup.packages: pkgname_to_groups[package['name']].append(dc) if dc.content.id in group_to_categories.keys(): for dc_category in group_to_categories[ dc.content.id]: dc.extra_data['category_relations'].append( dc_category) dc_category.extra_data['packagegroups'].append( dc) if dc.content.id in group_to_environments.keys(): for dc_environment in group_to_environments[ dc.content.id]: dc.extra_data['environment_relations'].append( dc_environment) dc_environment.extra_data[ 'packagegroups'].append(dc) if dc.content.id in optionalgroup_to_environments.keys( ): for dc_environment in optionalgroup_to_environments[ dc.content.id]: 
dc.extra_data['env_relations_optional'].append( dc_environment) dc_environment.extra_data[ 'optionalgroups'].append(dc) dc_groups.append(dc) for dc_category in dc_categories: await self.put(dc_category) for dc_environment in dc_environments: await self.put(dc_environment) # delete lists now that we're done with them for memory savings del dc_environments del dc_categories # to preserve order, downloaders are created after all repodata urls are identified package_repodata_downloaders = [] for repodata_type in PACKAGE_REPODATA: downloader = self.remote.get_downloader( url=package_repodata_urls[repodata_type]) package_repodata_downloaders.append(downloader.run()) downloaders.append(package_repodata_downloaders) # asyncio.gather is used to preserve the order of results for package repodata pending = [ asyncio.gather(*downloaders_group) for downloaders_group in downloaders ] while pending: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED) for downloader in done: try: results = downloader.result() except ClientResponseError as exc: raise HTTPNotFound( reason=_("File not found: {filename}").format( filename=exc.request_info.url)) if results[0].url == package_repodata_urls['primary']: primary_xml_path = results[0].path filelists_xml_path = results[1].path other_xml_path = results[2].path metadata_pb.done += 3 metadata_pb.save() packages = await RpmFirstStage.parse_repodata( primary_xml_path, filelists_xml_path, other_xml_path) # skip SRPM if defined if 'srpm' in self.skip_types: packages = { pkgId: pkg for pkgId, pkg in packages.items() if pkg.arch != 'src' } progress_data = { 'message': 'Parsed Packages', 'code': 'parsing.packages', 'total': len(packages), } with ProgressReport(**progress_data) as packages_pb: for pkg in packages.values(): package = Package( **Package.createrepo_to_dict(pkg)) artifact = Artifact(size=package.size_package) checksum_type = getattr( CHECKSUM_TYPES, package.checksum_type.upper()) setattr(artifact, checksum_type, package.pkgId) url = urljoin(remote_url, package.location_href) filename = os.path.basename( package.location_href) da = DeclarativeArtifact( artifact=artifact, url=url, relative_path=filename, remote=self.remote, deferred_download=self.deferred_download) dc = DeclarativeContent(content=package, d_artifacts=[da]) dc.extra_data = defaultdict(list) # find if a package relates to a modulemd if dc.content.nevra in nevra_to_module.keys(): dc.content.is_modular = True for dc_modulemd in nevra_to_module[ dc.content.nevra]: dc.extra_data[ 'modulemd_relation'].append( dc_modulemd) dc_modulemd.extra_data[ 'package_relation'].append(dc) if dc.content.name in pkgname_to_groups.keys(): for dc_group in pkgname_to_groups[ dc.content.name]: dc.extra_data[ 'group_relations'].append(dc_group) dc_group.extra_data[ 'related_packages'].append(dc) packages_pb.increment() await self.put(dc) elif results[0].url == updateinfo_url: updateinfo_xml_path = results[0].path metadata_pb.increment() updates = await RpmFirstStage.parse_updateinfo( updateinfo_xml_path) progress_data = { 'message': 'Parsed Advisories', 'code': 'parsing.advisories', 'total': len(updates), } with ProgressReport(**progress_data) as advisories_pb: for update in updates: update_record = UpdateRecord( **UpdateRecord.createrepo_to_dict(update)) update_record.digest = hash_update_record( update) future_relations = { 'collections': defaultdict(list), 'references': [] } for collection in update.collections: coll_dict = UpdateCollection.createrepo_to_dict( collection) coll = 
UpdateCollection(**coll_dict) for package in collection.packages: pkg_dict = UpdateCollectionPackage.createrepo_to_dict( package) pkg = UpdateCollectionPackage( **pkg_dict) future_relations['collections'][ coll].append(pkg) for reference in update.references: reference_dict = UpdateReference.createrepo_to_dict( reference) ref = UpdateReference(**reference_dict) future_relations['references'].append(ref) advisories_pb.increment() dc = DeclarativeContent(content=update_record) dc.extra_data = future_relations await self.put(dc) # now send modules down the pipeline since all relations have been set up for modulemd in modulemd_list: await self.put(modulemd) for dc_group in dc_groups: await self.put(dc_group)
def publish(
    repository_version_pk,
    gpgcheck_options=None,
    metadata_signing_service=None,
    checksum_types=None,
    sqlite_metadata=False,
):
    """
    Create a Publication based on a RepositoryVersion.

    Args:
        repository_version_pk (str): Create a publication from this repository version.
        gpgcheck_options (dict): GPG signature check options.
        metadata_signing_service (pulpcore.app.models.AsciiArmoredDetachedSigningService):
            A reference to an associated signing service.
        checksum_types (dict): Checksum types for metadata and packages.
        sqlite_metadata (bool): Whether to generate metadata files in sqlite format.
    """
    repository_version = RepositoryVersion.objects.get(pk=repository_version_pk)
    repository = repository_version.repository.cast()
    checksum_types = checksum_types or {}

    if metadata_signing_service:
        metadata_signing_service = AsciiArmoredDetachedSigningService.objects.get(
            pk=metadata_signing_service
        )

    checksum_types["original"] = repository.original_checksum_types

    log.info(
        _("Publishing: repository={repo}, version={version}").format(
            repo=repository.name,
            version=repository_version.number,
        )
    )

    with tempfile.TemporaryDirectory("."):
        with RpmPublication.create(repository_version) as publication:
            kwargs = {}
            first_package = repository_version.content.filter(
                pulp_type=Package.get_pulp_type()
            ).first()
            if first_package:
                kwargs["default"] = first_package.cast().checksum_type
            publication.metadata_checksum_type = get_checksum_type(
                "primary", checksum_types, **kwargs
            )
            publication.package_checksum_type = (
                checksum_types.get("package") or publication.metadata_checksum_type
            )

            if gpgcheck_options is not None:
                publication.gpgcheck = gpgcheck_options.get("gpgcheck")
                publication.repo_gpgcheck = gpgcheck_options.get("repo_gpgcheck")

            if sqlite_metadata:
                publication.sqlite_metadata = True

            publication_data = PublicationData(publication)
            publication_data.populate()

            total_repos = 1 + len(publication_data.sub_repos)
            pb_data = dict(
                message="Generating repository metadata",
                code="publish.generating_metadata",
                total=total_repos,
            )
            with ProgressReport(**pb_data) as publish_pb:
                content = publication.repository_version.content

                # Main repo
                generate_repo_metadata(
                    content,
                    publication,
                    checksum_types,
                    publication_data.repomdrecords,
                    metadata_signing_service=metadata_signing_service,
                )
                publish_pb.increment()

                for sub_repo in publication_data.sub_repos:
                    name = sub_repo[0]
                    checksum_types["original"] = getattr(publication_data, f"{name}_checksums")
                    content = getattr(publication_data, f"{name}_content")
                    extra_repomdrecords = getattr(publication_data, f"{name}_repomdrecords")
                    generate_repo_metadata(
                        content,
                        publication,
                        checksum_types,
                        extra_repomdrecords,
                        name,
                        metadata_signing_service=metadata_signing_service,
                    )
                    publish_pb.increment()

            log.info(_("Publication: {publication} created").format(publication=publication.pk))

            return publication
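# Hypothetical invocation illustrating the argument shapes this task expects. The key names
# for ``checksum_types`` ("package") and ``gpgcheck_options`` ("gpgcheck", "repo_gpgcheck")
# follow the usage inside the task body above; the repository version object itself is
# assumed to already exist, and this is a sketch rather than the plugin's documented API:
publication = publish(
    repository_version_pk=str(repository_version.pk),
    gpgcheck_options={"gpgcheck": 0, "repo_gpgcheck": 0},
    checksum_types={"package": "sha256"},
    sqlite_metadata=False,
)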
def publish_artifacts(self, content, prefix=""): """ Publish artifacts. Args: content (pulpcore.plugin.models.Content): content set. prefix (str): a relative path prefix for the published artifact """ published_artifacts = [] # Special case for Packages contentartifact_qs = ContentArtifact.objects.filter(content__in=content).filter( content__pulp_type=Package.get_pulp_type() ) paths = set() duplicated_paths = [] for content_artifact in contentartifact_qs.values("pk", "relative_path").iterator(): relative_path = content_artifact["relative_path"] relative_path = os.path.join( prefix, PACKAGES_DIRECTORY, relative_path.lower()[0], relative_path ) # # Some Suboptimal Repos have the 'same' artifact living in multiple places. # Specifically, the same NEVRA, in more than once place, **with different checksums** # (since if all that was different was location_href there would be only one # ContentArtifact in the first place). # # pulp_rpm wants to publish a 'canonical' repository-layout, under which an RPM # "name-version-release-arch" appears at "Packages/n/name-version-release-arch.rpm". # Because the assumption is that Packages don't "own" their path, only the filename # is kept as relative_path. # # In this case, we have to pick one - which is essentially what the rest of the RPM # Ecosystem does when faced with the impossible. This code takes the first-found. We # could implement something more complicated, if there are better options # (choose by last-created maybe?) # # Note that this only impacts user-created publications, which produce the "standard" # RPM layout of repo/Packages/f/foo.rpm. A publication created by mirror-sync retains # whatever layout their "upstream" repo-metadata dictates. # if relative_path in paths: duplicated_paths.append(f'{relative_path}:{content_artifact["pk"]}') continue else: paths.add(relative_path) published_artifacts.append( PublishedArtifact( relative_path=relative_path, publication=self.publication, content_artifact_id=content_artifact["pk"], ) ) if duplicated_paths: log.warning( _("Duplicate paths found at publish : {problems} ").format( problems="; ".join(duplicated_paths) ) ) # Handle everything else is_treeinfo = Q(relative_path__in=["treeinfo", ".treeinfo"]) unpublishable_types = Q( content__pulp_type__in=[ RepoMetadataFile.get_pulp_type(), Modulemd.get_pulp_type(), ModulemdDefaults.get_pulp_type(), # already dealt with Package.get_pulp_type(), ] ) contentartifact_qs = ( ContentArtifact.objects.filter(content__in=content) .exclude(unpublishable_types) .exclude(is_treeinfo) ) for content_artifact in contentartifact_qs.values("pk", "relative_path").iterator(): published_artifacts.append( PublishedArtifact( relative_path=content_artifact["relative_path"], publication=self.publication, content_artifact_id=content_artifact["pk"], ) ) PublishedArtifact.objects.bulk_create(published_artifacts, batch_size=2000)
def find_children_of_content(content, src_repo_version):
    """Finds the content referenced directly by other content and returns it all together.

    Finds RPMs referenced by Advisory/Errata content.

    Args:
        content (Queryset): Content for which to resolve children
        src_repo_version (pulpcore.models.RepositoryVersion): Source repo version

    Returns: Queryset of Content objects that are children of the initial set of content
    """
    # Content that was selected to be copied
    advisory_ids = content.filter(pulp_type=UpdateRecord.get_pulp_type()).only("pk")
    packagecategory_ids = content.filter(pulp_type=PackageCategory.get_pulp_type()).only("pk")
    packageenvironment_ids = content.filter(
        pulp_type=PackageEnvironment.get_pulp_type()).only("pk")
    packagegroup_ids = content.filter(pulp_type=PackageGroup.get_pulp_type()).only("pk")

    # Content in the source repository version
    package_ids = src_repo_version.content.filter(pulp_type=Package.get_pulp_type()).only("pk")
    module_ids = src_repo_version.content.filter(pulp_type=Modulemd.get_pulp_type()).only("pk")

    advisories = UpdateRecord.objects.filter(pk__in=advisory_ids)
    packages = Package.objects.filter(pk__in=package_ids)
    packagecategories = PackageCategory.objects.filter(pk__in=packagecategory_ids)
    packageenvironments = PackageEnvironment.objects.filter(pk__in=packageenvironment_ids)
    packagegroups = PackageGroup.objects.filter(pk__in=packagegroup_ids)
    modules = Modulemd.objects.filter(pk__in=module_ids)

    children = set()

    for advisory in advisories.iterator():
        # Find rpms referenced by Advisories/Errata
        package_nevras = advisory.get_pkglist()
        advisory_package_q = Q(pk__in=[])
        for nevra in package_nevras:
            (name, epoch, version, release, arch) = nevra
            advisory_package_q |= Q(
                name=name, epoch=epoch, version=version, release=release, arch=arch
            )
        children.update(packages.filter(advisory_package_q).values_list("pk", flat=True))

        module_nsvcas = advisory.get_module_list()
        advisory_module_q = Q(pk__in=[])
        for nsvca in module_nsvcas:
            (name, stream, version, context, arch) = nsvca
            advisory_module_q |= Q(
                name=name, stream=stream, version=version, context=context, arch=arch
            )
        children.update(modules.filter(advisory_module_q).values_list("pk", flat=True))

    # PackageCategories & PackageEnvironments resolution must go before PackageGroups
    packagegroup_names = set()
    for packagecategory in packagecategories.iterator():
        for group_id in packagecategory.group_ids:
            packagegroup_names.add(group_id["name"])
    for packageenvironment in packageenvironments.iterator():
        for group_id in packageenvironment.group_ids:
            packagegroup_names.add(group_id["name"])
        for group_id in packageenvironment.option_ids:
            packagegroup_names.add(group_id["name"])

    child_package_groups = PackageGroup.objects.filter(
        name__in=packagegroup_names, pk__in=src_repo_version.content
    )
    children.update([pkggroup.pk for pkggroup in child_package_groups])
    packagegroups = packagegroups.union(child_package_groups)

    # Find rpms referenced by PackageGroups
    packagegroup_package_names = set()
    for packagegroup in packagegroups.iterator():
        packagegroup_package_names |= set(pkg["name"] for pkg in packagegroup.packages)

    # TODO: do modular/nonmodular need to be taken into account?
    existing_package_names = (
        Package.objects.filter(
            name__in=packagegroup_package_names,
            pk__in=content,
        )
        .values_list("name", flat=True)
        .distinct()
    )

    missing_package_names = packagegroup_package_names - set(existing_package_names)

    needed_packages = Package.objects.with_age().filter(
        name__in=missing_package_names, pk__in=src_repo_version.content
    )

    # Pick the latest version of each package available which isn't already present
    # in the content set.
    for pkg in needed_packages.iterator():
        if pkg.age == 1:
            children.add(pkg.pk)

    return Content.objects.filter(pk__in=children)
async def run(self): """ Build `DeclarativeContent` from the repodata. """ packages_pb = ProgressBar(message='Parsed Packages') erratum_pb = ProgressBar(message='Parsed Erratum') packages_pb.save() erratum_pb.save() with ProgressBar(message='Downloading Metadata Files') as metadata_pb: downloader = self.remote.get_downloader( url=urljoin(self.remote.url, 'repodata/repomd.xml')) # TODO: decide how to distinguish between a mirror list and a normal repo result = await downloader.run() metadata_pb.increment() repomd_path = result.path repomd = cr.Repomd(repomd_path) package_repodata_urls = {} downloaders = [] for record in repomd.records: if record.type in PACKAGE_REPODATA: package_repodata_urls[record.type] = urljoin( self.remote.url, record.location_href) elif record.type in UPDATE_REPODATA: updateinfo_url = urljoin(self.remote.url, record.location_href) downloader = self.remote.get_downloader(url=updateinfo_url) downloaders.append([downloader.run()]) else: log.info( _('Unknown repodata type: {t}. Skipped.').format( t=record.type)) # TODO: skip databases, save unknown types to publish them as-is # to preserve order, downloaders are created after all repodata urls are identified package_repodata_downloaders = [] for repodata_type in PACKAGE_REPODATA: downloader = self.remote.get_downloader( url=package_repodata_urls[repodata_type]) package_repodata_downloaders.append(downloader.run()) downloaders.append(package_repodata_downloaders) # asyncio.gather is used to preserve the order of results for package repodata pending = [ asyncio.gather(*downloaders_group) for downloaders_group in downloaders ] while pending: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED) for downloader in done: results = downloader.result() if results[0].url == package_repodata_urls['primary']: primary_xml_path = results[0].path filelists_xml_path = results[1].path other_xml_path = results[2].path metadata_pb.done += 3 metadata_pb.save() packages = await RpmFirstStage.parse_repodata( primary_xml_path, filelists_xml_path, other_xml_path) packages_pb.total = len(packages) packages_pb.state = 'running' packages_pb.save() for pkg in packages.values(): package = Package( **Package.createrepo_to_dict(pkg)) artifact = Artifact(size=package.size_package) checksum_type = getattr( CHECKSUM_TYPES, package.checksum_type.upper()) setattr(artifact, checksum_type, package.pkgId) url = urljoin(self.remote.url, package.location_href) filename = os.path.basename(package.location_href) da = DeclarativeArtifact( artifact=artifact, url=url, relative_path=filename, remote=self.remote, deferred_download=self.deferred_download) dc = DeclarativeContent(content=package, d_artifacts=[da]) packages_pb.increment() await self.put(dc) elif results[0].url == updateinfo_url: updateinfo_xml_path = results[0].path metadata_pb.increment() updates = await RpmFirstStage.parse_updateinfo( updateinfo_xml_path) erratum_pb.total = len(updates) erratum_pb.state = 'running' erratum_pb.save() for update in updates: update_record = UpdateRecord( **UpdateRecord.createrepo_to_dict(update)) update_record.digest = RpmFirstStage.hash_update_record( update) future_relations = { 'collections': defaultdict(list), 'references': [] } for collection in update.collections: coll_dict = UpdateCollection.createrepo_to_dict( collection) coll = UpdateCollection(**coll_dict) for package in collection.packages: pkg_dict = UpdateCollectionPackage.createrepo_to_dict( package) pkg = UpdateCollectionPackage(**pkg_dict) future_relations['collections'][ 
coll].append(pkg) for reference in update.references: reference_dict = UpdateReference.createrepo_to_dict( reference) ref = UpdateReference(**reference_dict) future_relations['references'].append(ref) erratum_pb.increment() dc = DeclarativeContent(content=update_record) dc.extra_data = future_relations await self.put(dc) packages_pb.state = 'completed' erratum_pb.state = 'completed' packages_pb.save() erratum_pb.save()
def publish_artifacts(self, content, prefix=""): """ Publish artifacts. Args: content (pulpcore.plugin.models.Content): content set. prefix (str): a relative path prefix for the published artifact """ published_artifacts = [] # Special case for Packages contentartifact_qs = (ContentArtifact.objects.filter( content__in=content).filter( content__pulp_type=Package.get_pulp_type()).select_related( "content__rpm_package__time_build")) rel_path_mapping = defaultdict(list) # Some Suboptimal Repos have the 'same' artifact living in multiple places. # Specifically, the same NEVRA, in more than once place, **with different checksums** # (since if all that was different was location_href there would be only one # ContentArtifact in the first place). # # pulp_rpm wants to publish a 'canonical' repository-layout, under which an RPM # "name-version-release-arch" appears at "Packages/n/name-version-release-arch.rpm". # Because the assumption is that Packages don't "own" their path, only the filename # is kept as relative_path. # # In this case, we have to pick one - which is essentially what the rest of the RPM # Ecosystem does when faced with the impossible. This code takes the one with the # most recent build time which is the same heuristic used by Yum/DNF/Zypper. # # Note that this only impacts user-created publications, which produce the "standard" # RPM layout of repo/Packages/f/foo.rpm. A publication created by mirror-sync retains # whatever layout their "upstream" repo-metadata dictates. fields = ["pk", "relative_path", "content__rpm_package__time_build"] for content_artifact in contentartifact_qs.values(*fields).iterator(): relative_path = content_artifact["relative_path"] time_build = content_artifact["content__rpm_package__time_build"] relative_path = os.path.join(prefix, PACKAGES_DIRECTORY, relative_path.lower()[0], relative_path) rel_path_mapping[relative_path].append( (content_artifact["pk"], time_build)) for rel_path, content_artifacts in rel_path_mapping.items(): # sort the content artifacts by when the package was built if len(content_artifacts) > 1: content_artifacts.sort(key=lambda p: p[1], reverse=True) log.warning( "Duplicate packages found competing for {path}, selected the one with " "the most recent build time, excluding {others} others.". format(path=rel_path, others=len(content_artifacts[1:]))) # Only add the first one (the one with the highest build time) published_artifacts.append( PublishedArtifact( relative_path=rel_path, publication=self.publication, content_artifact_id=content_artifacts[0][0], )) # Handle everything else is_treeinfo = Q(relative_path__in=["treeinfo", ".treeinfo"]) unpublishable_types = Q(content__pulp_type__in=[ RepoMetadataFile.get_pulp_type(), Modulemd.get_pulp_type(), ModulemdDefaults.get_pulp_type(), # already dealt with Package.get_pulp_type(), ]) contentartifact_qs = (ContentArtifact.objects.filter( content__in=content).exclude(unpublishable_types).exclude( is_treeinfo)) for content_artifact in contentartifact_qs.values( "pk", "relative_path").iterator(): published_artifacts.append( PublishedArtifact( relative_path=content_artifact["relative_path"], publication=self.publication, content_artifact_id=content_artifact["pk"], )) PublishedArtifact.objects.bulk_create(published_artifacts, batch_size=2000)
async def __call__(self, in_q, out_q): """ Build `DeclarativeContent` from the repodata. Args: in_q (asyncio.Queue): Unused because the first stage doesn't read from an input queue. out_q (asyncio.Queue): The out_q to send `DeclarativeContent` objects to """ with ProgressBar(message='Downloading and Parsing Metadata') as pb: downloader = self.remote.get_downloader( urljoin(self.remote.url, 'repodata/repomd.xml')) # TODO: decide how to distinguish between a mirror list and a normal repo result = await downloader.run() pb.increment() repomd_path = result.path repomd = cr.Repomd(repomd_path) package_repodata_urls = {} downloaders = [] for record in repomd.records: if record.type in PACKAGE_REPODATA: package_repodata_urls[record.type] = urljoin( self.remote.url, record.location_href) elif record.type in UPDATE_REPODATA: updateinfo_url = urljoin(self.remote.url, record.location_href) downloader = self.remote.get_downloader(updateinfo_url) downloaders.append([downloader.run()]) else: log.info( _('Unknown repodata type: {t}. Skipped.').format( t=record.type)) # TODO: skip databases, save unknown types to publish them as-is # to preserve order, downloaders are created after all repodata urls are identified package_repodata_downloaders = [] for repodata_type in PACKAGE_REPODATA: downloader = self.remote.get_downloader( package_repodata_urls[repodata_type]) package_repodata_downloaders.append(downloader.run()) downloaders.append(package_repodata_downloaders) # asyncio.gather is used to preserve the order of results for package repodata pending = [ asyncio.gather(*downloaders_group) for downloaders_group in downloaders ] while pending: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED) for downloader in done: results = downloader.result() if results[0].url == package_repodata_urls['primary']: primary_xml_path = results[0].path filelists_xml_path = results[1].path other_xml_path = results[2].path pb.done += 3 pb.save() packages = await RpmFirstStage.parse_repodata( primary_xml_path, filelists_xml_path, other_xml_path) for pkg in packages.values(): package = Package( **Package.createrepo_to_dict(pkg)) artifact = Artifact(size=package.size_package) checksum_type = getattr( CHECKSUM_TYPES, package.checksum_type.upper()) setattr(artifact, checksum_type, package.pkgId) url = urljoin(self.remote.url, package.location_href) da = DeclarativeArtifact(artifact, url, package.location_href, self.remote) dc = DeclarativeContent(content=package, d_artifacts=[da]) await out_q.put(dc) elif results[0].url == updateinfo_url: updateinfo_xml_path = results[0].path pb.increment() updates = await RpmFirstStage.parse_updateinfo( updateinfo_xml_path) for update in updates: update_record = UpdateRecord( **UpdateRecord.createrepo_to_dict(update)) update_record.digest = RpmFirstStage.hash_update_record( update) for collection in update.collections: coll_dict = UpdateCollection.createrepo_to_dict( collection) coll = UpdateCollection(**coll_dict) for package in collection.packages: pkg_dict = UpdateCollectionPackage.createrepo_to_dict( package) pkg = UpdateCollectionPackage(**pkg_dict) coll._packages.append(pkg) update_record._collections.append(coll) for reference in update.references: reference_dict = UpdateReference.createrepo_to_dict( reference) update_record._references.append( UpdateReference(**reference_dict)) dc = DeclarativeContent(content=update_record) await out_q.put(dc) await out_q.put(None)
async def run(self): """ Build `DeclarativeContent` from the repodata. """ with ProgressBar(message='Downloading and Parsing Metadata') as pb: downloader = self.remote.get_downloader( url=urljoin(self.remote.url, 'repodata/repomd.xml') ) # TODO: decide how to distinguish between a mirror list and a normal repo result = await downloader.run() pb.increment() repomd_path = result.path repomd = cr.Repomd(repomd_path) package_repodata_urls = {} downloaders = [] for record in repomd.records: if record.type in PACKAGE_REPODATA: package_repodata_urls[record.type] = urljoin(self.remote.url, record.location_href) elif record.type in UPDATE_REPODATA: updateinfo_url = urljoin(self.remote.url, record.location_href) downloader = self.remote.get_downloader(url=updateinfo_url) downloaders.append([downloader.run()]) else: log.info(_('Unknown repodata type: {t}. Skipped.').format(t=record.type)) # TODO: skip databases, save unknown types to publish them as-is # to preserve order, downloaders are created after all repodata urls are identified package_repodata_downloaders = [] for repodata_type in PACKAGE_REPODATA: downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type]) package_repodata_downloaders.append(downloader.run()) downloaders.append(package_repodata_downloaders) # asyncio.gather is used to preserve the order of results for package repodata pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders] while pending: done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) for downloader in done: results = downloader.result() if results[0].url == package_repodata_urls['primary']: primary_xml_path = results[0].path filelists_xml_path = results[1].path other_xml_path = results[2].path pb.done += 3 pb.save() packages = await RpmFirstStage.parse_repodata(primary_xml_path, filelists_xml_path, other_xml_path) for pkg in packages.values(): package = Package(**Package.createrepo_to_dict(pkg)) artifact = Artifact(size=package.size_package) checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper()) setattr(artifact, checksum_type, package.pkgId) url = urljoin(self.remote.url, package.location_href) filename = os.path.basename(package.location_href) da = DeclarativeArtifact( artifact=artifact, url=url, relative_path=filename, remote=self.remote, deferred_download=self.deferred_download ) dc = DeclarativeContent(content=package, d_artifacts=[da]) await self.put(dc) elif results[0].url == updateinfo_url: updateinfo_xml_path = results[0].path pb.increment() updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path) for update in updates: update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update)) update_record.digest = RpmFirstStage.hash_update_record(update) for collection in update.collections: coll_dict = UpdateCollection.createrepo_to_dict(collection) coll = UpdateCollection(**coll_dict) for package in collection.packages: pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package) pkg = UpdateCollectionPackage(**pkg_dict) coll._packages.append(pkg) update_record._collections.append(coll) for reference in update.references: reference_dict = UpdateReference.createrepo_to_dict(reference) update_record._references.append(UpdateReference(**reference_dict)) dc = DeclarativeContent(content=update_record) await self.put(dc)
async def run(self): """ Build `DeclarativeContent` from the repodata. """ packages_pb = ProgressReport(message='Parsed Packages', code='parsing.packages') errata_pb = ProgressReport(message='Parsed Erratum', code='parsing.errata') modulemd_pb = ProgressReport(message='Parse Modulemd', code='parsing.modulemds') modulemd_defaults_pb = ProgressReport( message='Parse Modulemd-defaults', code='parsing.modulemddefaults') comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps') packages_pb.save() errata_pb.save() comps_pb.save() remote_url = self.new_url or self.remote.url remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/" progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata') with ProgressReport(**progress_data) as metadata_pb: downloader = self.remote.get_downloader( url=urljoin(remote_url, 'repodata/repomd.xml')) # TODO: decide how to distinguish between a mirror list and a normal repo result = await downloader.run() metadata_pb.increment() if self.kickstart: d_artifacts = [] for path, checksum in self.kickstart["download"][ "images"].items(): artifact = Artifact(**checksum) da = DeclarativeArtifact( artifact=artifact, url=urljoin(remote_url, path), relative_path=path, remote=self.remote, deferred_download=self.deferred_download) d_artifacts.append(da) distribution_tree = DistributionTree( **self.kickstart["distribution_tree"]) dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts) dc.extra_data = self.kickstart await self.put(dc) repomd_path = result.path repomd = cr.Repomd(repomd_path) package_repodata_urls = {} downloaders = [] modulemd_list = list() dc_groups = [] dc_categories = [] dc_environments = [] nevra_to_module = defaultdict(dict) pkgname_to_groups = defaultdict(list) group_to_categories = defaultdict(list) group_to_environments = defaultdict(list) optionalgroup_to_environments = defaultdict(list) modulemd_results = None comps_downloader = None for record in repomd.records: if record.type in PACKAGE_REPODATA: package_repodata_urls[record.type] = urljoin( remote_url, record.location_href) elif record.type in UPDATE_REPODATA: updateinfo_url = urljoin(remote_url, record.location_href) downloader = self.remote.get_downloader(url=updateinfo_url) downloaders.append([downloader.run()]) elif record.type in COMPS_REPODATA: comps_url = urljoin(remote_url, record.location_href) comps_downloader = self.remote.get_downloader( url=comps_url) elif record.type in SKIP_REPODATA: continue elif record.type in MODULAR_REPODATA: modules_url = urljoin(remote_url, record.location_href) modulemd_downloader = self.remote.get_downloader( url=modules_url) modulemd_results = await modulemd_downloader.run() elif record.type not in PACKAGE_DB_REPODATA: file_data = { record.checksum_type: record.checksum, "size": record.size } da = DeclarativeArtifact( artifact=Artifact(**file_data), url=urljoin(remote_url, record.location_href), relative_path=record.location_href, remote=self.remote, deferred_download=False) repo_metadata_file = RepoMetadataFile( data_type=record.type, checksum_type=record.checksum_type, checksum=record.checksum, ) dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da]) await self.put(dc) # we have to sync module.yaml first if it exists, to make relations to packages if modulemd_results: modulemd_index = mmdlib.ModuleIndex.new() open_func = gzip.open if modulemd_results.url.endswith( '.gz') else open with open_func(modulemd_results.path, 'r') as moduleyaml: modulemd_index.update_from_string( 
            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            modulemd_pb.total = len(modulemd_all)
            modulemd_pb.state = 'running'
            modulemd_pb.save()

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME], modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION], modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(
                    artifact=artifact, relative_path=relative_path, url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in json.loads(dc.content.artifacts):
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                modulemd_list.append(dc)

            modulemd_default_names = parse_defaults(modulemd_index)

            modulemd_defaults_pb.total = len(modulemd_default_names)
            modulemd_defaults_pb.state = 'running'
            modulemd_defaults_pb.save()

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(
                    artifact=artifact, relative_path=relative_path, url=modules_url)
                default_content = ModulemdDefaults(**default)
                modulemd_defaults_pb.increment()
                dc = DeclarativeContent(content=default_content, d_artifacts=[da])
                await self.put(dc)

        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            comps_pb.total = (
                len(comps.groups) + len(comps.categories) + len(comps.environments))
            comps_pb.state = 'running'
            comps_pb.save()

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)
                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)
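                    # Cross-link this group with any categories/environments seen
                    # earlier; both sides record the relation in extra_data so it can
                    # be wired up after the objects themselves are saved.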
                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[dc.content.id]:
                            dc.extra_data['environment_relations'].append(dc_environment)
                            dc_environment.extra_data['packagegroups'].append(dc)

                    if dc.content.id in optionalgroup_to_environments.keys():
                        for dc_environment in optionalgroup_to_environments[dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(dc)

                    dc_groups.append(dc)

            for dc_category in dc_categories:
                comps_pb.increment()
                await self.put(dc_category)

            for dc_environment in dc_environments:
                comps_pb.increment()
                await self.put(dc_environment)

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []

        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders]

        while pending:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        dc.extra_data = defaultdict(list)

                        # find if a package relates to a modulemd
                        if dc.content.nevra in nevra_to_module.keys():
                            dc.content.is_modular = True
                            for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                dc.extra_data['modulemd_relation'].append(dc_modulemd)
                                dc_modulemd.extra_data['package_relation'].append(dc)

                        if dc.content.name in pkgname_to_groups.keys():
                            for dc_group in pkgname_to_groups[dc.content.name]:
                                dc.extra_data['group_relations'].append(dc_group)
                                dc_group.extra_data['related_packages'].append(dc)

                        packages_pb.increment()
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)

                    errata_pb.total = len(updates)
                    errata_pb.state = 'running'
                    errata_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)
                        future_relations = {'collections': defaultdict(list), 'references': []}
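                        # Collections and references cannot be attached until the
                        # UpdateRecord exists in the database, so they are carried
                        # along in extra_data and saved later.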
                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        errata_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

        # now send modules down the pipeline since all relations have been set up
        for modulemd in modulemd_list:
            modulemd_pb.increment()
            await self.put(modulemd)

        for dc_group in dc_groups:
            comps_pb.increment()
            await self.put(dc_group)

    packages_pb.state = 'completed'
    errata_pb.state = 'completed'
    modulemd_pb.state = 'completed'
    modulemd_defaults_pb.state = 'completed'
    comps_pb.state = 'completed'
    packages_pb.save()
    errata_pb.save()
    modulemd_pb.save()
    modulemd_defaults_pb.save()
    comps_pb.save()
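# Illustration only (not part of the plugin): a minimal sketch of why run()
# forces a trailing slash onto remote_url before joining repodata paths.
# urllib.parse.urljoin replaces the last path segment of a base URL that lacks
# a trailing slash, so relative hrefs from repomd.xml would otherwise resolve
# one level too high. The mirror URL below is a made-up example, and
# normalize_remote_url is a hypothetical helper mirroring the inline expression.
from urllib.parse import urljoin


def normalize_remote_url(url):
    """Mirror the normalization done at the top of run()."""
    return url if url[-1] == "/" else f"{url}/"


base = "https://mirror.example.com/centos/8/BaseOS/x86_64/os"

# Without the trailing slash, the final segment ("os") is dropped:
assert urljoin(base, "repodata/repomd.xml") == \
    "https://mirror.example.com/centos/8/BaseOS/x86_64/repodata/repomd.xml"

# With the normalization applied, relative repodata hrefs resolve as expected:
assert urljoin(normalize_remote_url(base), "repodata/repomd.xml") == \
    "https://mirror.example.com/centos/8/BaseOS/x86_64/os/repodata/repomd.xml"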