async def create_pulp3_content(self):
    """
    Build the Pulp 3 Advisory content so that it can be saved later in a bulk operation.

    This is still a placeholder: the advisory is constructed from an empty field mapping
    and no relations are produced yet.

    Returns:
        tuple: ``(advisory, relations)`` where ``advisory`` is an ``UpdateRecord`` instance
            and ``relations`` is an (currently empty) dict.
    """
    # TODO: figure out
    #    - how to split back merged errata into multiple ones

    # Create createrepo_c update record based on pulp2 data
    record_fields = {}

    # TODO: UpdateCollection and UpdateReference
    related = {}

    # digest = hash_update_record(record_fields)
    # advisory.digest = digest
    return UpdateRecord(**record_fields), related
async def _parse_advisories(self, updates):
    """
    Convert parsed updateinfo records into advisory content and emit it.

    For every record an ``UpdateRecord`` is created together with the not-yet-saved
    collections, collection packages and references that belong to it. Those related
    objects travel with the declarative content in ``extra_data`` under the keys
    ``"collections"`` (collection -> list of packages) and ``"references"`` (list).

    Args:
        updates: sized iterable of createrepo_c update records.
    """
    report_kwargs = dict(
        message="Parsed Advisories",
        code="parsing.advisories",
        total=len(updates),
    )

    with ProgressReport(**report_kwargs) as advisories_pb:
        for update in updates:
            advisory = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
            advisory.digest = hash_update_record(update)

            # A collection only becomes a key once its first package is appended.
            packages_by_collection = defaultdict(list)
            for cr_collection in update.collections:
                collection = UpdateCollection(
                    **UpdateCollection.createrepo_to_dict(cr_collection)
                )
                for cr_package in cr_collection.packages:
                    package_fields = UpdateCollectionPackage.createrepo_to_dict(cr_package)
                    packages_by_collection[collection].append(
                        UpdateCollectionPackage(**package_fields)
                    )

            references = [
                UpdateReference(**UpdateReference.createrepo_to_dict(cr_reference))
                for cr_reference in update.references
            ]

            advisories_pb.increment()

            dc = DeclarativeContent(content=advisory)
            dc.extra_data = {
                "collections": packages_by_collection,
                "references": references,
            }
            await self.put(dc)
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads ``repodata/repomd.xml``, then the package metadata (the types listed in
    ``PACKAGE_REPODATA``) and, when present, the updateinfo metadata. Every parsed package
    is emitted as a ``Package`` with one ``DeclarativeArtifact``; every parsed update is
    emitted as an ``UpdateRecord`` with its collections, packages and references attached.

    Raises:
        FileNotFoundError: if repomd.xml does not list every type in ``PACKAGE_REPODATA``.
    """
    # urljoin() replaces the last path segment of a base URL that has no trailing slash
    # ("http://host/repo" + "repodata/x" -> "http://host/repodata/x"), so normalize once
    # and use the normalized base for every join below.
    remote_url = self.remote.url
    if not remote_url.endswith('/'):
        remote_url = remote_url + '/'

    with ProgressBar(message='Downloading and Parsing Metadata') as pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml')
        )
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        pb.increment()

        repomd = cr.Repomd(result.path)
        package_repodata_urls = {}
        downloaders = []
        # Stays None when repomd.xml has no updateinfo record, so the URL comparison in the
        # download loop below never touches an unbound name.
        updateinfo_url = None

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            else:
                log.info(_('Unknown repodata type: {t}. Skipped.').format(t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # Fail with a descriptive error instead of a bare KeyError further down.
        missing_types = [
            repodata_type for repodata_type in PACKAGE_REPODATA
            if repodata_type not in package_repodata_urls
        ]
        if missing_types:
            raise FileNotFoundError(
                _("XML file(s): {filename} not found").format(filename=", ".join(missing_types))
            )

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders]

        while pending:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for finished_group in done:
                results = finished_group.result()

                if results[0].url == package_repodata_urls['primary']:
                    # Results arrive in PACKAGE_REPODATA order thanks to asyncio.gather.
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    pb.done += 3
                    pb.save()

                    packages = await RpmFirstStage.parse_repodata(primary_xml_path,
                                                                  filelists_xml_path,
                                                                  other_xml_path)
                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        # The package id doubles as the checksum value; store it on the
                        # artifact field that corresponds to the package's checksum type.
                        checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download
                        )
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)
                    for update in updates:
                        update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                coll._packages.append(pkg)

                            update_record._collections.append(coll)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            update_record._references.append(UpdateReference(**reference_dict))

                        dc = DeclarativeContent(content=update_record)
                        await self.put(dc)
def resolve_advisories(version, previous_version):
    """
    Decide which advisories to add to a repo version and which to remove, and adjust a repo version.

    Advisory can be in 3 different states with relation to a repository version:
     - in-memory and added before this function call, so it's a part of the current incomplete
       repository version only
     - in the db, it's been added in some previous repository version
     - has no relation to any repository version because it's been created in this function as an
       outcome of conflict resolution.

    All 3 states need to be handled differently.
    The in-db ones and newly created are straightforward, just remove/add in a standard way.
    To remove in-memory ones (`content_pks_to_exclude`) from an incomplete repo version,
    one needs to do it directly from RepositoryContent. They've never been a part of a repo
    version, they are also not among the `content_pks_to_add` or `content_pks_to_remove` ones.

    Args:
        version (pulpcore.app.models.RepositoryVersion): current incomplete repository version
        previous_version (pulpcore.app.models.RepositoryVersion): a version preceding
                                                                  the current incomplete one

    Raises:
        AdvisoryConflict: if two of the advisories added in `version` share the same advisory id.

    """
    content_pks_to_add = set()
    content_pks_to_remove = set()
    content_pks_to_exclude = set()  # exclude from the set of content which is being added

    # identify conflicting advisories
    advisory_pulp_type = UpdateRecord.get_pulp_type()
    current_advisories = UpdateRecord.objects.filter(
        pk__in=version.content.filter(pulp_type=advisory_pulp_type))
    added_advisories = current_advisories
    advisory_conflicts = []

    # check for any conflict
    current_ids = [adv.id for adv in current_advisories]
    if previous_version and len(current_ids) != len(set(current_ids)):
        previous_advisories = UpdateRecord.objects.filter(
            pk__in=previous_version.content.filter(pulp_type=advisory_pulp_type))
        previous_advisory_ids = set(previous_advisories.values_list('id', flat=True))

        # diff for querysets works fine but the result is not fully functional queryset,
        # e.g. filtering doesn't work
        added_advisories = current_advisories.difference(previous_advisories)

        # Compare advisory ids rather than model instances: instances are hashed and
        # compared by primary key, so a list-vs-set comparison of the rows themselves
        # can never reveal two different advisories carrying the same id.
        added_ids = [adv.id for adv in added_advisories]
        added_advisory_ids = set(added_ids)
        if len(added_ids) != len(added_advisory_ids):
            raise AdvisoryConflict(
                _('It is not possible to add two advisories of the same id to '
                  'a repository version.'))
        advisory_conflicts = added_advisory_ids.intersection(previous_advisory_ids)

    added_advisory_pks = [adv.pk for adv in added_advisories]
    for advisory_id in advisory_conflicts:
        # `previous_advisories` is always bound here: conflicts are only ever collected
        # inside the branch above that defines it.
        previous_advisory = previous_advisories.get(id=advisory_id)
        added_advisory = UpdateRecord.objects.get(id=advisory_id, pk__in=added_advisory_pks)
        to_add, to_remove, to_exclude = resolve_advisory_conflict(
            previous_advisory, added_advisory)
        content_pks_to_add.update(to_add)
        content_pks_to_remove.update(to_remove)
        content_pks_to_exclude.update(to_exclude)

    if content_pks_to_add:
        version.add_content(Content.objects.filter(pk__in=content_pks_to_add))

    if content_pks_to_remove:
        version.remove_content(Content.objects.filter(pk__in=content_pks_to_remove))

    if content_pks_to_exclude:
        # Content added in this very version is dropped straight from RepositoryContent.
        RepositoryContent.objects.filter(repository=version.repository,
                                         content_id__in=content_pks_to_exclude,
                                         version_added=version).delete()
def create_pulp3_content(self):
    """
    Create a Pulp 3 Advisory content for saving it later in a bulk operation.

    The Pulp 2 erratum fields are first copied into a createrepo_c ``UpdateRecord``
    (including its collections, optional module, packages and references). That record is
    then converted into the Pulp 3 models.

    Returns:
        tuple: ``(update_record, relations)``. ``update_record`` is the ``UpdateRecord``
            with its digest set. ``relations`` is a dict with the keys ``'collections'``
            (collection -> list of collection packages) and ``'references'`` (list).
    """
    cr_record = cr.UpdateRecord()
    cr_record.fromstr = self.errata_from
    cr_record.status = self.status
    cr_record.type = self.errata_type
    cr_record.version = self.version
    cr_record.id = self.errata_id
    cr_record.title = self.title
    cr_record.issued_date = get_datetime(self.issued)
    cr_record.updated_date = get_datetime(self.updated)
    cr_record.rights = self.rights
    cr_record.summary = self.summary
    cr_record.description = self.description
    cr_record.reboot_suggested = get_bool(self.reboot_suggested)
    cr_record.severity = self.severity
    cr_record.solution = self.solution
    cr_record.release = self.release
    cr_record.pushcount = self.pushcount

    # Step 1: mirror the Pulp 2 collections and references onto the createrepo_c record.
    for pulp2_collection in self.get_collections():
        cr_collection = cr.UpdateCollection()
        cr_collection.shortname = pulp2_collection.get('short')
        cr_collection.name = pulp2_collection.get('name')

        pulp2_module = pulp2_collection.get('module')
        if pulp2_module:
            cr_module = cr.UpdateCollectionModule()
            cr_module.name = pulp2_module['name']
            cr_module.stream = pulp2_module['stream']
            cr_module.version = int(pulp2_module['version'])
            cr_module.context = pulp2_module['context']
            cr_module.arch = pulp2_module['arch']
            cr_collection.module = cr_module

        for pulp2_package in pulp2_collection.get('packages', []):
            cr_package = cr.UpdateCollectionPackage()
            cr_package.name = pulp2_package['name']
            cr_package.version = pulp2_package['version']
            cr_package.release = pulp2_package['release']
            cr_package.epoch = pulp2_package['epoch']
            cr_package.arch = pulp2_package['arch']
            cr_package.src = pulp2_package.get('src')
            cr_package.filename = pulp2_package['filename']
            cr_package.reboot_suggested = get_bool(pulp2_package.get('reboot_suggested'))
            cr_package.restart_suggested = get_bool(pulp2_package.get('restart_suggested'))
            cr_package.relogin_suggested = get_bool(pulp2_package.get('relogin_suggested'))

            # The checksum is optional; only set it when one was found.
            checksum = get_package_checksum(pulp2_package)
            if checksum:
                cr_package.sum_type, cr_package.sum = checksum

            cr_collection.append(cr_package)

        cr_record.append_collection(cr_collection)

    for pulp2_reference in self.references:
        cr_reference = cr.UpdateReference()
        cr_reference.href = pulp2_reference.get('href')
        cr_reference.id = pulp2_reference.get('id')
        cr_reference.type = pulp2_reference.get('type')
        cr_reference.title = pulp2_reference.get('title')
        cr_record.append_reference(cr_reference)

    # Step 2: turn the createrepo_c record into Pulp 3 models.
    advisory = UpdateRecord(**UpdateRecord.createrepo_to_dict(cr_record))
    advisory.digest = hash_update_record(cr_record)

    packages_by_collection = defaultdict(list)
    for cr_collection in cr_record.collections:
        collection = UpdateCollection(**UpdateCollection.createrepo_to_dict(cr_collection))
        for cr_package in cr_collection.packages:
            package_fields = UpdateCollectionPackage.createrepo_to_dict(cr_package)
            packages_by_collection[collection].append(UpdateCollectionPackage(**package_fields))

    references = [
        UpdateReference(**UpdateReference.createrepo_to_dict(cr_reference))
        for cr_reference in cr_record.references
    ]

    relations = {'collections': packages_by_collection, 'references': references}
    return advisory, relations
def find_children_of_content(content, src_repo_version):
    """Collect the content that the given content refers to directly.

    Resolves, within the source repository version:
      - packages and modules referenced by advisories,
      - package groups referenced by package categories and package environments,
      - packages referenced by package groups (only those whose ``age`` is 1).

    A lookup of an advisory's package or module that matches nothing, or more than one
    object, propagates the exception raised by ``get()``.

    Args:
        content (iterable): Content for which to resolve children
        src_repo_version (pulpcore.models.RepositoryVersion): Source repo version

    Returns:
        Queryset of Content objects that are children of the initial set of content

    """
    def groups_named(group_name):
        # Package groups of that name which are part of the source repository version.
        return PackageGroup.objects.filter(name=group_name, pk__in=src_repo_version.content)

    # Content that were selected to be copied
    advisory_ids = content.filter(pulp_type=UpdateRecord.get_pulp_type()).only('pk')
    packagecategory_ids = content.filter(pulp_type=PackageCategory.get_pulp_type()).only('pk')
    packageenvironment_ids = content.filter(
        pulp_type=PackageEnvironment.get_pulp_type()).only('pk')
    packagegroup_ids = content.filter(pulp_type=PackageGroup.get_pulp_type()).only('pk')

    # Content in the source repository version
    package_ids = src_repo_version.content.filter(pulp_type=Package.get_pulp_type()).only('pk')
    module_ids = src_repo_version.content.filter(pulp_type=Modulemd.get_pulp_type()).only('pk')

    advisories = UpdateRecord.objects.filter(pk__in=advisory_ids)
    packages = Package.objects.filter(pk__in=package_ids)
    packagecategories = PackageCategory.objects.filter(pk__in=packagecategory_ids)
    packageenvironments = PackageEnvironment.objects.filter(pk__in=packageenvironment_ids)
    packagegroups = PackageGroup.objects.filter(pk__in=packagegroup_ids)
    modules = Modulemd.objects.filter(pk__in=module_ids)

    children = set()

    for advisory in advisories:
        # Find rpms referenced by Advisories/Errata
        for name, epoch, version, release, arch in advisory.get_pkglist():
            rpm = packages.get(
                name=name, epoch=epoch, version=version, release=release, arch=arch)
            children.add(rpm.pk)

        # Find modules referenced by Advisories/Errata
        for name, stream, version, context, arch in advisory.get_module_list():
            modulemd = modules.get(
                name=name, stream=stream, version=version, context=context, arch=arch)
            children.add(modulemd.pk)

    # PackageCategories & PackageEnvironments resolution must go before PackageGroups
    # TODO: refactor to be more efficient (lower number of queries)
    for category in packagecategories.iterator():
        for group_ref in category.group_ids:
            matched_groups = groups_named(group_ref['name'])
            children.update([group.pk for group in matched_groups])
            packagegroups = packagegroups.union(matched_groups)

    for environment in packageenvironments.iterator():
        for group_ref in environment.group_ids:
            matched_groups = groups_named(group_ref['name'])
            children.update([group.pk for group in matched_groups])
            packagegroups = packagegroups.union(matched_groups)

        for option_ref in environment.option_ids:
            matched_groups = groups_named(option_ref['name'])
            children.update([group.pk for group in matched_groups])
            packagegroups = packagegroups.union(matched_groups)

    # Find rpms referenced by PackageGroups
    for group in packagegroups.iterator():
        package_names = [entry['name'] for entry in group.packages]
        for package_name in package_names:
            candidates = Package.objects.with_age().filter(
                name=package_name, pk__in=src_repo_version.content)
            for rpm in [candidate for candidate in candidates if candidate.age == 1]:
                children.add(rpm.pk)

    return Content.objects.filter(pk__in=children)
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Overview of what the body does, in order:
      1. download and parse ``repodata/repomd.xml``; return early when the sync can be
         skipped (see the "Caution" note below),
      2. emit the distribution tree when ``self.treeinfo`` is set,
      3. walk the repomd records, collecting URLs/downloaders and emitting unknown
         metadata files as ``RepoMetadataFile`` content,
      4. parse modulemd and modulemd-defaults (defaults are emitted immediately, modules
         are held back until package relations are known),
      5. parse comps (langpacks, categories, environments, groups) and wire up relations,
      6. download and parse package metadata and updateinfo, emitting packages and
         advisories,
      7. finally emit the held-back modules and package groups.

    Raises:
        FileNotFoundError: if repomd.xml lacks any of the ``PACKAGE_REPODATA`` types.
        HTTPNotFound: if a metadata download fails with ``ClientResponseError``.
    """
    remote_url = self.new_url or self.remote.url
    # urljoin() needs a trailing slash on the base to keep its last path segment.
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"
    optimize_sync = self.optimize

    progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)

        # Caution: we are not storing when the remote was last updated, so the order of this
        # logic must remain in this order where we first check the version number as other
        # changes than sync could have taken place such that the date or repo version will be
        # different from last sync
        if (optimize_sync and self.repository.last_sync_remote
                and self.remote.pk == self.repository.last_sync_remote.pk and
                (self.repository.last_sync_repo_version
                 == self.repository.latest_version().number) and
                (self.remote.pulp_last_updated
                 <= self.repository.latest_version().pulp_created)
                and is_previous_version(
                    repomd.revision,
                    self.repository.last_sync_revision_number)):
            # Report a completed "Optimizing Sync" step and stop before emitting any content.
            optimize_data = dict(message='Optimizing Sync', code='optimizing.sync')
            with ProgressReport(**optimize_data) as optimize_pb:
                optimize_pb.done = 1
                optimize_pb.save()
                return

        self.repository.last_sync_revision_number = repomd.revision

        if self.treeinfo:
            # The .treeinfo file itself is always downloaded immediately; the images
            # follow the stage's deferred_download setting.
            d_artifacts = [
                DeclarativeArtifact(
                    artifact=Artifact(),
                    url=urljoin(remote_url, self.treeinfo["filename"]),
                    relative_path=".treeinfo",
                    remote=self.remote,
                    deferred_download=False,
                )
            ]
            for path, checksum in self.treeinfo["download"]["images"].items():
                artifact = Artifact(**checksum)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)

            distribution_tree = DistributionTree(**self.treeinfo["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
            dc.extra_data = self.treeinfo
            await self.put(dc)

        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        # NOTE(review): declared as defaultdict(dict), but the values stored below (via
        # setdefault) are sets of declarative content - confirm the intended default.
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None
        main_types = set()
        checksums = {}

        for record in repomd.records:
            checksums[record.type] = record.checksum_type.upper()
            if record.type in PACKAGE_REPODATA:
                main_types.update([record.type])
                package_repodata_urls[record.type] = urljoin(
                    remote_url, record.location_href)

            elif record.type in UPDATE_REPODATA:
                # NOTE(review): updateinfo_url is only bound when repomd lists such a
                # record, yet it is compared against in the download loop below.
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])

            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(url=comps_url)

            elif record.type in SKIP_REPODATA:
                continue

            elif '_zck' in record.type:
                continue

            elif record.type in MODULAR_REPODATA:
                # Unlike the other metadata, modules are downloaded right away.
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(url=modules_url)
                modulemd_results = await modulemd_downloader.run()

            elif record.type not in PACKAGE_DB_REPODATA:
                # Any other metadata type is emitted as a RepoMetadataFile.
                file_data = {
                    record.checksum_type: record.checksum,
                    "size": record.size
                }
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da])
                await self.put(dc)

        missing_type = set(PACKAGE_REPODATA) - main_types
        if missing_type:
            raise FileNotFoundError(
                _("XML file(s): {filename} not found").format(
                    filename=", ".join(missing_type)))

        self.repository.original_checksum_types = checksums

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith('.gz') else open
            with open_func(modulemd_results.path, 'r') as moduleyaml:
                content = moduleyaml.read()
                # The content is decoded when read() returned something other than str.
                module_content = content if isinstance(content, str) else content.decode()
                modulemd_index.update_from_string(module_content, True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            # Parsing modules happens all at one time, and from here on no useful work happens.
            # So just report that it finished this stage.
            modulemd_pb_data = {
                'message': 'Parsed Modulemd',
                'code': 'parsing.modulemds'
            }
            with ProgressReport(**modulemd_pb_data) as modulemd_pb:
                modulemd_total = len(modulemd_all)
                modulemd_pb.total = modulemd_total
                modulemd_pb.done = modulemd_total

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME],
                    modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION],
                    modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in dc.content.artifacts:
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                # Held back; modules are emitted at the very end of this method.
                modulemd_list.append(dc)

            # delete list now that we're done with it for memory savings
            del modulemd_all

            modulemd_default_names = parse_defaults(modulemd_index)

            # Parsing module-defaults happens all at one time, and from here on no useful
            # work happens. So just report that it finished this stage.
            modulemd_defaults_pb_data = {
                'message': 'Parsed Modulemd-defaults',
                'code': 'parsing.modulemd_defaults'
            }
            with ProgressReport(**modulemd_defaults_pb_data) as modulemd_defaults_pb:
                modulemd_defaults_total = len(modulemd_default_names)
                modulemd_defaults_pb.total = modulemd_defaults_total
                modulemd_defaults_pb.done = modulemd_defaults_total

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                default_content = ModulemdDefaults(**default)
                dc = DeclarativeContent(content=default_content, d_artifacts=[da])
                await self.put(dc)

            # delete list now that we're done with it for memory savings
            del modulemd_default_names

        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            # Comps is parsed in one go, so the report is marked complete right away.
            with ProgressReport(message='Parsed Comps', code='parsing.comps') as comps_pb:
                comps_total = (len(comps.groups) + len(comps.categories) +
                               len(comps.environments))
                comps_pb.total = comps_total
                comps_pb.done = comps_total

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    # Remember which categories mention each group name.
                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    # Optional and regular group references are tracked separately.
                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)

                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    # Link the group with the categories/environments collected above;
                    # the relation is recorded on both sides.
                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[dc.content.id]:
                            dc.extra_data['environment_relations'].append(dc_environment)
                            dc_environment.extra_data['packagegroups'].append(dc)

                    if dc.content.id in optionalgroup_to_environments.keys():
                        for dc_environment in optionalgroup_to_environments[dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(dc)

                    # Held back; groups are emitted at the very end of this method.
                    dc_groups.append(dc)

            for dc_category in dc_categories:
                await self.put(dc_category)

            for dc_environment in dc_environments:
                await self.put(dc_environment)

        # delete lists now that we're done with them for memory savings
        del dc_environments
        del dc_categories

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group) for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                try:
                    results = downloader.result()
                except ClientResponseError as exc:
                    raise HTTPNotFound(
                        reason=_("File not found: {filename}").format(
                            filename=exc.request_info.url))
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)

                    # skip SRPM if defined
                    if 'srpm' in self.skip_types:
                        packages = {
                            pkgId: pkg
                            for pkgId, pkg in packages.items() if pkg.arch != 'src'
                        }

                    progress_data = {
                        'message': 'Parsed Packages',
                        'code': 'parsing.packages',
                        'total': len(packages),
                    }
                    with ProgressReport(**progress_data) as packages_pb:
                        for pkg in packages.values():
                            package = Package(**Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
                            # The package id is stored on the artifact field that matches
                            # the package's checksum type.
                            checksum_type = getattr(
                                CHECKSUM_TYPES, package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(remote_url, package.location_href)
                            filename = os.path.basename(package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package, d_artifacts=[da])
                            dc.extra_data = defaultdict(list)

                            # find if a package relates to a modulemd
                            if dc.content.nevra in nevra_to_module.keys():
                                dc.content.is_modular = True
                                for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                    dc.extra_data['modulemd_relation'].append(dc_modulemd)
                                    dc_modulemd.extra_data['package_relation'].append(dc)

                            # find if a package is listed in any package group
                            if dc.content.name in pkgname_to_groups.keys():
                                for dc_group in pkgname_to_groups[dc.content.name]:
                                    dc.extra_data['group_relations'].append(dc_group)
                                    dc_group.extra_data['related_packages'].append(dc)

                            packages_pb.increment()
                            await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)

                    progress_data = {
                        'message': 'Parsed Advisories',
                        'code': 'parsing.advisories',
                        'total': len(updates),
                    }
                    with ProgressReport(**progress_data) as advisories_pb:
                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = hash_update_record(update)
                            # Related objects are attached to the declarative content
                            # via extra_data rather than to the record itself.
                            future_relations = {
                                'collections': defaultdict(list),
                                'references': []
                            }

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(**pkg_dict)
                                    future_relations['collections'][coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(
                                    reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            advisories_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

        # now send modules down the pipeline since all relations have been set up
        for modulemd in modulemd_list:
            await self.put(modulemd)

        for dc_group in dc_groups:
            await self.put(dc_group)
def resolve_advisories(version, previous_version):
    """
    Decide which advisories to add to a repo version and which to remove, and adjust a repo version.

    Conflicts are only looked for when the current version contains at least two advisories
    with the same advisory id and a previous version exists. Each conflict between an added
    advisory and one from the previous version is passed to `resolve_advisory_conflict`,
    whose result says which content to add, to remove, and to exclude from this version.

    Args:
        version (pulpcore.app.models.RepositoryVersion): current incomplete repository version
        previous_version (pulpcore.app.models.RepositoryVersion): a version preceding
                                                                  the current incomplete one

    Raises:
        AdvisoryConflict: if two of the advisories added in `version` share the same advisory id.

    """
    content_pks_to_add = set()
    content_pks_to_remove = set()
    content_pks_to_exclude = set()  # exclude from the set of content which is being added

    # identify conflicting advisories
    advisory_pulp_type = UpdateRecord.get_pulp_type()
    current_advisories = UpdateRecord.objects.filter(
        pk__in=version.content.filter(pulp_type=advisory_pulp_type))
    added_advisories = current_advisories
    advisory_conflicts = []

    # check for IDs if any conflict
    # e.g. mirror mode can have already removed advisories with the same ID
    current_ids = [adv.id for adv in current_advisories]
    if previous_version and len(current_ids) != len(set(current_ids)):
        previous_advisories = UpdateRecord.objects.filter(
            pk__in=previous_version.content.filter(pulp_type=advisory_pulp_type))
        previous_advisory_ids = set(previous_advisories.values_list('id', flat=True))

        # diff for querysets works fine but the result is not fully functional queryset,
        # e.g. filtering doesn't work
        added_advisories = current_advisories.difference(previous_advisories)

        # Duplicates must be detected on the advisory id. Model instances are hashed and
        # compared by primary key, so comparing a list and a set of the instances
        # themselves would never differ for distinct rows.
        added_ids = [adv.id for adv in added_advisories]
        added_advisory_ids = set(added_ids)
        if len(added_ids) != len(added_advisory_ids):
            raise AdvisoryConflict(
                _('It is not possible to add two advisories of the same id to '
                  'a repository version.'))
        advisory_conflicts = added_advisory_ids.intersection(previous_advisory_ids)

    added_advisory_pks = [adv.pk for adv in added_advisories]
    for advisory_id in advisory_conflicts:
        # `previous_advisories` is always bound here: conflicts are only ever collected
        # inside the branch above that defines it.
        previous_advisory = previous_advisories.get(id=advisory_id)
        added_advisory = UpdateRecord.objects.get(id=advisory_id, pk__in=added_advisory_pks)
        to_add, to_remove, to_exclude = resolve_advisory_conflict(
            previous_advisory, added_advisory)
        content_pks_to_add.update(to_add)
        content_pks_to_remove.update(to_remove)
        content_pks_to_exclude.update(to_exclude)

    if content_pks_to_add:
        version.add_content(Content.objects.filter(pk__in=content_pks_to_add))

    if content_pks_to_remove:
        version.remove_content(Content.objects.filter(pk__in=content_pks_to_remove))

    if content_pks_to_exclude:
        # Content added in this very version is dropped straight from RepositoryContent.
        RepositoryContent.objects.filter(repository=version.repository,
                                         content_id__in=content_pks_to_exclude,
                                         version_added=version).delete()
def find_children_of_content(content, src_repo_version):
    """Collect the content that the given content refers to directly.

    Resolves packages and modules listed by advisories, package groups named by
    package categories and environments, and packages named by package groups.

    Args:
        content (Queryset): Content for which to resolve children
        src_repo_version (pulpcore.models.RepositoryVersion): Source repo version

    Returns:
        Queryset of Content objects that are children of the initial set of content

    """
    def _pks_of_type(queryset, model):
        # Narrow a content queryset to one pulp type, loading primary keys only.
        return queryset.filter(pulp_type=model.get_pulp_type()).only("pk")

    # Content that was selected to be copied
    selected_advisories = UpdateRecord.objects.filter(
        pk__in=_pks_of_type(content, UpdateRecord))
    selected_categories = PackageCategory.objects.filter(
        pk__in=_pks_of_type(content, PackageCategory))
    selected_environments = PackageEnvironment.objects.filter(
        pk__in=_pks_of_type(content, PackageEnvironment))
    selected_groups = PackageGroup.objects.filter(
        pk__in=_pks_of_type(content, PackageGroup))

    # Content available in the source repository version
    source_packages = Package.objects.filter(
        pk__in=_pks_of_type(src_repo_version.content, Package))
    source_modules = Modulemd.objects.filter(
        pk__in=_pks_of_type(src_repo_version.content, Modulemd))

    child_pks = set()

    for advisory in selected_advisories.iterator():
        # RPMs referenced by the advisory; the empty `pk__in` seed matches nothing,
        # so an advisory without packages contributes no children.
        nevra_query = Q(pk__in=[])
        for name, epoch, version, release, arch in advisory.get_pkglist():
            nevra_query |= Q(name=name, epoch=epoch, version=version,
                             release=release, arch=arch)
        child_pks.update(
            source_packages.filter(nevra_query).values_list("pk", flat=True))

        # Modules referenced by the advisory
        nsvca_query = Q(pk__in=[])
        for name, stream, version, context, arch in advisory.get_module_list():
            nsvca_query |= Q(name=name, stream=stream, version=version,
                             context=context, arch=arch)
        child_pks.update(
            source_modules.filter(nsvca_query).values_list("pk", flat=True))

    # PackageCategories & PackageEnvironments resolution must go before PackageGroups
    group_names = set()
    for category in selected_categories.iterator():
        group_names.update(entry["name"] for entry in category.group_ids)
    for environment in selected_environments.iterator():
        group_names.update(entry["name"] for entry in environment.group_ids)
        group_names.update(entry["name"] for entry in environment.option_ids)

    referenced_groups = PackageGroup.objects.filter(
        name__in=group_names, pk__in=src_repo_version.content)
    child_pks.update([group.pk for group in referenced_groups])
    all_groups = selected_groups.union(referenced_groups)

    # Find rpms referenced by PackageGroups
    wanted_package_names = set()
    for group in all_groups.iterator():
        wanted_package_names.update(pkg["name"] for pkg in group.packages)

    # TODO: do modular/nonmodular need to be taken into account?
    present_names_qs = Package.objects.filter(
        name__in=wanted_package_names,
        pk__in=content,
    )
    present_names = present_names_qs.values_list("name", flat=True).distinct()
    absent_names = wanted_package_names - set(present_names)

    candidate_packages = Package.objects.with_age().filter(
        name__in=absent_names, pk__in=src_repo_version.content)

    # Pick the latest version of each package available which isn't already present
    # in the content set.
    child_pks.update(
        pkg.pk for pkg in candidate_packages.iterator() if pkg.age == 1)

    return Content.objects.filter(pk__in=child_pks)
def resolve_advisories(version, previous_version):
    """
    Decide which advisories to add to a repo version and which to remove, and adjust a repo version.

    Advisory can be in 3 different states with relation to a repository version:
     - in-memory and added before this function call, so it's a part of the current incomplete
       repository version only
     - in the db, it's been added in some previous repository version
     - has no relation to any repository version because it's been created in this function as an
       outcome of conflict resolution.

    All 3 states need to be handled differently.
    The in-db ones and newly created are straightforward, just remove/add in a standard way.
    To remove in-memory ones (`content_pks_to_exclude`) from an incomplete repo version,
    one needs to do it directly from RepositoryContent. They've never been a part of a repo
    version, they are also not among the `content_pks_to_add` or `content_pks_to_remove` ones.

    Args:
        version (pulpcore.app.models.RepositoryVersion): current incomplete repository version
        previous_version (pulpcore.app.models.RepositoryVersion): a version preceding
            the current incomplete one

    Raises:
        AdvisoryConflict: if more than one advisory with the same id is being added.

    """
    # identify conflicting advisories
    advisory_pulp_type = UpdateRecord.get_pulp_type()
    current_advisories = UpdateRecord.objects.filter(
        pk__in=version.content.filter(pulp_type=advisory_pulp_type))

    # check for any conflict
    unique_advisory_ids = {adv.id for adv in current_advisories}
    if len(current_advisories) == len(unique_advisory_ids):
        # no conflicts
        return

    current_advisories_by_id = defaultdict(list)
    for advisory in current_advisories:
        current_advisories_by_id[advisory.id].append(advisory)

    if previous_version:
        previous_advisories = UpdateRecord.objects.filter(
            pk__in=previous_version.content.filter(pulp_type=advisory_pulp_type))
        previous_advisory_ids = set(previous_advisories.values_list("id", flat=True))

        # diff for querysets works fine but the result is not fully functional queryset,
        # e.g. filtering doesn't work
        added_advisories = current_advisories.difference(previous_advisories)
        added_advisories_by_id = defaultdict(list)
        for advisory in added_advisories:
            added_advisories_by_id[advisory.id].append(advisory)
    else:
        previous_advisory_ids = set()
        added_advisories = current_advisories
        added_advisories_by_id = current_advisories_by_id

    # Conflicts can be in different places and behaviour differs based on that.
    # `in_added`, when conflict happens in the added advisories, this is not allowed and
    # should fail.
    # `added_vs_previous`, a standard conflict between an advisory which is being added and the one
    # in the preceding repo version. This should be resolved according to the heuristics,
    # unless previous repo version has conflicts. In the latter case, the added advisory is picked.
    advisory_id_conflicts = {"in_added": [], "added_vs_previous": []}
    for advisory_id, advisories in current_advisories_by_id.items():
        # we are only interested in conflicts where added advisory is present, we are not trying
        # to fix old conflicts in the existing repo version. There is no real harm in those,
        # just confusing.
        if len(advisories) > 1 and advisory_id in added_advisories_by_id:
            # if the conflict is in added advisories (2+ advisories with the same id are being
            # added), we need to collect such ids to fail later with
            # a list of all conflicting advisories. No other processing of those is needed.
            if len(added_advisories_by_id[advisory_id]) > 1:
                advisory_id_conflicts["in_added"].append(advisory_id)
            # a standard conflict is detected
            elif advisory_id in previous_advisory_ids:
                advisory_id_conflicts["added_vs_previous"].append(advisory_id)

    if advisory_id_conflicts["in_added"]:
        # Format after the gettext call so the lookup uses the untouched template string;
        # formatting first would produce a msgid no translation catalog can contain.
        raise AdvisoryConflict(
            _("It is not possible to add more than one advisory with the same id to a "
              "repository version. Affected advisories: {}.").format(
                ",".join(advisory_id_conflicts["in_added"])))

    content_pks_to_add = set()
    content_pks_to_remove = set()
    content_pks_to_exclude = set()  # exclude from the set of content which is being added

    # `previous_advisories` is only bound when `previous_version` is set; without it
    # `previous_advisory_ids` is empty, so this list is empty and the loop is skipped.
    for advisory_id in advisory_id_conflicts["added_vs_previous"]:
        previous_advisory_qs = previous_advisories.filter(id=advisory_id)
        # there can only be one added advisory at this point otherwise the AdvisoryConflict
        # would have been raised by now
        added_advisory = added_advisories_by_id[advisory_id][0]
        added_advisory.touch()
        if previous_advisory_qs.count() > 1:
            # due to an old bug there could be N advisories with the same id in a repo,
            # this is wrong and there may not be a good way to resolve those, so let's take a
            # new one.
            content_pks_to_add.update([added_advisory.pk])
            content_pks_to_remove.update([adv.pk for adv in previous_advisory_qs])
        else:
            to_add, to_remove, to_exclude = resolve_advisory_conflict(
                previous_advisory_qs.first(), added_advisory)
            content_pks_to_add.update(to_add)
            content_pks_to_remove.update(to_remove)
            content_pks_to_exclude.update(to_exclude)

    if content_pks_to_add:
        version.add_content(Content.objects.filter(pk__in=content_pks_to_add))

    if content_pks_to_remove:
        version.remove_content(Content.objects.filter(pk__in=content_pks_to_remove))

    if content_pks_to_exclude:
        RepositoryContent.objects.filter(
            repository=version.repository,
            content_id__in=content_pks_to_exclude,
            version_added=version,
        ).delete()
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads `repodata/repomd.xml`, then the package and updateinfo metadata it lists,
    and puts one `DeclarativeContent` per package and per advisory.
    """
    packages_pb = ProgressBar(message='Parsed Packages')
    erratum_pb = ProgressBar(message='Parsed Erratum')

    packages_pb.save()
    erratum_pb.save()

    with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
        repomd_downloader = self.remote.get_downloader(
            url=urljoin(self.remote.url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        repomd_result = await repomd_downloader.run()
        metadata_pb.increment()

        repomd = cr.Repomd(repomd_result.path)
        package_repodata_urls = {}
        download_groups = []

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(
                    self.remote.url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(self.remote.url, record.location_href)
                updateinfo_downloader = self.remote.get_downloader(url=updateinfo_url)
                download_groups.append([updateinfo_downloader.run()])
            else:
                log.info(_('Unknown repodata type: {t}. Skipped.').format(t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        download_groups.append([
            self.remote.get_downloader(url=package_repodata_urls[repodata_type]).run()
            for repodata_type in PACKAGE_REPODATA
        ])

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*group) for group in download_groups]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for finished in done:
                results = finished.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for cr_package in packages.values():
                        package = Package(**Package.createrepo_to_dict(cr_package))
                        artifact = Artifact(size=package.size_package)
                        # The package id is stored on the artifact attribute named after
                        # the package's checksum type.
                        checksum_attr = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_attr, package.pkgId)
                        declarative_artifact = DeclarativeArtifact(
                            artifact=artifact,
                            url=urljoin(self.remote.url, package.location_href),
                            relative_path=os.path.basename(package.location_href),
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        package_dc = DeclarativeContent(
                            content=package, d_artifacts=[declarative_artifact])
                        packages_pb.increment()
                        await self.put(package_dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)

                    erratum_pb.total = len(updates)
                    erratum_pb.state = 'running'
                    erratum_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)

                        # Related objects travel with the content as extra data
                        collection_packages = defaultdict(list)
                        references = []
                        future_relations = {
                            'collections': collection_packages,
                            'references': references,
                        }

                        for cr_collection in update.collections:
                            collection = UpdateCollection(
                                **UpdateCollection.createrepo_to_dict(cr_collection))
                            for cr_coll_package in cr_collection.packages:
                                coll_package = UpdateCollectionPackage(
                                    **UpdateCollectionPackage.createrepo_to_dict(
                                        cr_coll_package))
                                collection_packages[collection].append(coll_package)

                        for cr_reference in update.references:
                            references.append(UpdateReference(
                                **UpdateReference.createrepo_to_dict(cr_reference)))

                        erratum_pb.increment()
                        advisory_dc = DeclarativeContent(content=update_record)
                        advisory_dc.extra_data = future_relations
                        await self.put(advisory_dc)

    packages_pb.state = 'completed'
    erratum_pb.state = 'completed'
    packages_pb.save()
    erratum_pb.save()
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads `repodata/repomd.xml` and the metadata files it lists, then puts
    `DeclarativeContent` for the distribution tree (when `self.kickstart` is set), other
    repo metadata files, modulemd defaults, comps content, packages, advisories and modules.
    Modules and package groups are put last, after packages have been related to them.
    """
    packages_pb = ProgressReport(message='Parsed Packages', code='parsing.packages')
    errata_pb = ProgressReport(message='Parsed Erratum', code='parsing.errata')
    modulemd_pb = ProgressReport(message='Parse Modulemd', code='parsing.modulemds')
    modulemd_defaults_pb = ProgressReport(
        message='Parse Modulemd-defaults', code='parsing.modulemddefaults')
    comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

    packages_pb.save()
    errata_pb.save()
    comps_pb.save()

    # urljoin() replaces the last path segment of a base without a trailing slash,
    # so make sure the base url ends with one.
    remote_url = self.new_url or self.remote.url
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

    progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        if self.kickstart:
            d_artifacts = []
            for path, checksum in self.kickstart["download"]["images"].items():
                artifact = Artifact(**checksum)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)

            distribution_tree = DistributionTree(**self.kickstart["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
            dc.extra_data = self.kickstart
            await self.put(dc)

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(url=comps_url)
            elif record.type in SKIP_REPODATA:
                continue
            elif record.type in MODULAR_REPODATA:
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(url=modules_url)
                modulemd_results = await modulemd_downloader.run()
            elif record.type not in PACKAGE_DB_REPODATA:
                # Any other metadata file is kept as a RepoMetadataFile
                file_data = {record.checksum_type: record.checksum, "size": record.size}
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da])
                await self.put(dc)

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith('.gz') else open
            # Binary mode for both openers: the content is decoded below, and the builtin
            # open() in text mode ('r') returns str, which has no decode().
            with open_func(modulemd_results.path, 'rb') as moduleyaml:
                modulemd_index.update_from_string(moduleyaml.read().decode(), True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            modulemd_pb.total = len(modulemd_all)
            modulemd_pb.state = 'running'
            modulemd_pb.save()

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME],
                    modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION],
                    modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(
                    artifact=artifact,
                    relative_path=relative_path,
                    url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in json.loads(dc.content.artifacts):
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                # modules are put later, once packages have been related to them
                modulemd_list.append(dc)

            modulemd_default_names = parse_defaults(modulemd_index)

            modulemd_defaults_pb.total = len(modulemd_default_names)
            modulemd_defaults_pb.state = 'running'
            modulemd_defaults_pb.save()

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(
                    artifact=artifact,
                    relative_path=relative_path,
                    url=modules_url)
                default_content = ModulemdDefaults(**default)
                modulemd_defaults_pb.increment()
                dc = DeclarativeContent(content=default_content, d_artifacts=[da])
                await self.put(dc)

        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            comps_pb.total = (
                len(comps.groups) + len(comps.categories) + len(comps.environments))
            comps_pb.state = 'running'
            comps_pb.save()

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)

                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    # Membership is tested before indexing so the defaultdicts do not
                    # gain empty entries for unrelated group ids.
                    if dc.content.id in group_to_categories:
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    if dc.content.id in group_to_environments:
                        for dc_environment in group_to_environments[dc.content.id]:
                            dc.extra_data['environment_relations'].append(dc_environment)
                            dc_environment.extra_data['packagegroups'].append(dc)

                    if dc.content.id in optionalgroup_to_environments:
                        for dc_environment in optionalgroup_to_environments[dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(dc)

                    # groups are put later, once packages have been related to them
                    dc_groups.append(dc)

            for dc_category in dc_categories:
                comps_pb.increment()
                await self.put(dc_category)

            for dc_environment in dc_environments:
                comps_pb.increment()
                await self.put(dc_environment)

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group) for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        dc.extra_data = defaultdict(list)

                        # find if a package relates to a modulemd
                        if dc.content.nevra in nevra_to_module:
                            dc.content.is_modular = True
                            for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                dc.extra_data['modulemd_relation'].append(dc_modulemd)
                                dc_modulemd.extra_data['package_relation'].append(dc)

                        if dc.content.name in pkgname_to_groups:
                            for dc_group in pkgname_to_groups[dc.content.name]:
                                dc.extra_data['group_relations'].append(dc_group)
                                dc_group.extra_data['related_packages'].append(dc)

                        packages_pb.increment()
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)

                    errata_pb.total = len(updates)
                    errata_pb.state = 'running'
                    errata_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)
                        future_relations = {
                            'collections': defaultdict(list),
                            'references': []
                        }

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        errata_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    # now send modules down the pipeline since all relations have been set up
    for modulemd in modulemd_list:
        modulemd_pb.increment()
        await self.put(modulemd)

    for dc_group in dc_groups:
        comps_pb.increment()
        await self.put(dc_group)

    packages_pb.state = 'completed'
    errata_pb.state = 'completed'
    modulemd_pb.state = 'completed'
    modulemd_defaults_pb.state = 'completed'
    comps_pb.state = 'completed'
    packages_pb.save()
    errata_pb.save()
    modulemd_pb.save()
    modulemd_defaults_pb.save()
    comps_pb.save()
def find_children_of_content(content, repository_version):
    """Finds the content referenced directly by other content and returns it all together.

    Finds RPMs and modules referenced by Advisory/Errata content.

    Args:
        content (Queryset): Content for which to resolve children; it is narrowed with
            `.filter()`, so a plain iterable is not sufficient
        repository_version (pulpcore.models.RepositoryVersion): Source repo version

    Returns:
        Queryset of Content objects that are children of the initial set of content

    Raises:
        Package.DoesNotExist: an advisory lists a package missing from `repository_version`
        Modulemd.DoesNotExist: an advisory lists a module missing from `repository_version`
        MultipleObjectsReturned: more than one package or module matches a listed entry

    """
    # Advisories that were selected to be copied
    advisory_ids = content.filter(pulp_type=UpdateRecord.get_pulp_type()).only('pk')
    # All packages in the source repository version
    package_ids = repository_version.content.filter(
        pulp_type=Package.get_pulp_type()).only('pk')
    # All modules in the source repository version
    module_ids = repository_version.content.filter(
        pulp_type=Modulemd.get_pulp_type()).only('pk')

    advisories = UpdateRecord.objects.filter(pk__in=advisory_ids)
    packages = Package.objects.filter(pk__in=package_ids)
    modules = Modulemd.objects.filter(pk__in=module_ids)

    children = set()

    for advisory in advisories:
        # Find rpms referenced by Advisories/Errata.
        # Lookup failures from .get() are deliberately left to propagate to the caller.
        for name, epoch, version, release, arch in advisory.get_pkglist():
            package = packages.get(
                name=name, epoch=epoch, version=version, release=release, arch=arch)
            children.add(package.pk)

        # Find modules referenced by Advisories/Errata
        for name, stream, version, context, arch in advisory.get_module_list():
            module = modules.get(
                name=name, stream=stream, version=version, context=context, arch=arch)
            children.add(module.pk)

    # TODO: Find rpms referenced by PackageGroups,
    #       PackageGroups referenced by PackageCategories, etc.

    return Content.objects.filter(pk__in=children)
async def __call__(self, in_q, out_q):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads `repodata/repomd.xml`, then the package and updateinfo metadata it lists, and
    sends one `DeclarativeContent` per package and per advisory to `out_q`, followed by `None`.

    Args:
        in_q (asyncio.Queue): Unused because the first stage doesn't read from an input queue.
        out_q (asyncio.Queue): The out_q to send `DeclarativeContent` objects to

    """
    with ProgressBar(message='Downloading and Parsing Metadata') as pb:
        repomd_downloader = self.remote.get_downloader(
            urljoin(self.remote.url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        repomd_result = await repomd_downloader.run()
        pb.increment()

        repomd = cr.Repomd(repomd_result.path)
        package_repodata_urls = {}
        download_groups = []

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(
                    self.remote.url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(self.remote.url, record.location_href)
                updateinfo_downloader = self.remote.get_downloader(updateinfo_url)
                download_groups.append([updateinfo_downloader.run()])
            else:
                log.info(_('Unknown repodata type: {t}. Skipped.').format(t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        download_groups.append([
            self.remote.get_downloader(package_repodata_urls[repodata_type]).run()
            for repodata_type in PACKAGE_REPODATA
        ])

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*group) for group in download_groups]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for finished in done:
                results = finished.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    pb.done += 3
                    pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    for cr_package in packages.values():
                        package = Package(**Package.createrepo_to_dict(cr_package))
                        artifact = Artifact(size=package.size_package)
                        # The package id is stored on the artifact attribute named after
                        # the package's checksum type.
                        checksum_attr = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_attr, package.pkgId)
                        declarative_artifact = DeclarativeArtifact(
                            artifact,
                            urljoin(self.remote.url, package.location_href),
                            package.location_href,
                            self.remote)
                        await out_q.put(DeclarativeContent(
                            content=package, d_artifacts=[declarative_artifact]))

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)
                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)

                        for cr_collection in update.collections:
                            collection = UpdateCollection(
                                **UpdateCollection.createrepo_to_dict(cr_collection))
                            for cr_coll_package in cr_collection.packages:
                                collection._packages.append(UpdateCollectionPackage(
                                    **UpdateCollectionPackage.createrepo_to_dict(
                                        cr_coll_package)))
                            update_record._collections.append(collection)

                        for cr_reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(cr_reference)
                            update_record._references.append(
                                UpdateReference(**reference_dict))

                        await out_q.put(DeclarativeContent(content=update_record))

    await out_q.put(None)