def _resolve_distribution_trees(self, new_version, previous_version):
    """
    There can be only one distribution tree in a repo version.

    Args:
        new_version (pulpcore.app.models.RepositoryVersion): current incomplete repository version
        previous_version (pulpcore.app.models.RepositoryVersion): a version preceding
            the current incomplete one

    """
    disttree_pulp_type = DistributionTree.get_pulp_type()
    current_disttrees = new_version.content.filter(pulp_type=disttree_pulp_type)

    if len(current_disttrees) < 2:
        return

    if previous_version:
        previous_disttree = previous_version.content.get(pulp_type=disttree_pulp_type)
        new_version.remove_content(Content.objects.filter(pk=previous_disttree.pk))

    incoming_disttrees = new_version.content.filter(pulp_type=disttree_pulp_type)
    if len(incoming_disttrees) != 1:
        raise DistributionTreeConflict(
            _("More than one distribution tree cannot be added to a repository version."))

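# Hedged usage sketch (assumed wiring, not copied from the plugin): a check like the one above
# is intended to run while the new repository version is still incomplete, for example from a
# repository's finalize_new_version hook. `get_previous_version` is a hypothetical helper.
class ExampleRpmRepository:
    def finalize_new_version(self, new_version):
        """Refuse to finalize a version that would hold more than one distribution tree."""
        previous_version = get_previous_version(new_version)  # hypothetical lookup
        self._resolve_distribution_trees(new_version, previous_version)
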
async def parse_distribution_tree(self):
    """Parse content from the file treeinfo if present."""
    if self.treeinfo:
        d_artifacts = [
            DeclarativeArtifact(
                artifact=Artifact(),
                url=urljoin(self.data.remote_url, self.treeinfo["filename"]),
                relative_path=".treeinfo",
                remote=self.remote,
                deferred_download=False,
            )
        ]
        for path, checksum in self.treeinfo["download"]["images"].items():
            artifact = Artifact(**checksum)

            da = DeclarativeArtifact(
                artifact=artifact,
                url=urljoin(self.data.remote_url, path),
                relative_path=path,
                remote=self.remote,
                deferred_download=self.deferred_download,
            )
            d_artifacts.append(da)

        distribution_tree = DistributionTree(**self.treeinfo["distribution_tree"])
        dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
        dc.extra_data = self.treeinfo
        await self.put(dc)

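# Orientation only: a hypothetical shape for `self.treeinfo` as consumed above. The exact keys
# come from the plugin's treeinfo parsing; the values below are invented placeholders.
EXAMPLE_TREEINFO = {
    "filename": "treeinfo",
    "download": {
        "images": {
            # relative path -> kwargs accepted by Artifact(), e.g. checksum fields and size
            "images/pxeboot/vmlinuz": {"sha256": "0000...", "size": 123456},
        },
    },
    "distribution_tree": {
        # kwargs used for DistributionTree(**...)
    },
}
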
def create_pulp3_content(self):
    """
    Create a Pulp 3 Distribution content for saving later in a bulk operation.
    """
    namespaces = [".treeinfo", "treeinfo"]
    for namespace in namespaces:
        treeinfo = PulpTreeInfo()
        try:
            treeinfo.load(f=os.path.join(self.pulp2content.pulp2_storage_path, namespace))
        except FileNotFoundError:
            continue
        self.filename = namespace
        treeinfo_parsed = treeinfo.parsed_sections()
        treeinfo_serialized = TreeinfoData(treeinfo_parsed).to_dict(filename=namespace)

        # Pulp 2 only knows about the top level kickstart repository
        treeinfo_serialized["repositories"] = {'.': None}
        # Pulp 2 did not support addon repositories, so we should not list them here either
        treeinfo_serialized['addons'] = {}
        # Pulp 2 only supported variants that are in the root of the repository
        variants = {}
        for name, variant in treeinfo_serialized['variants'].items():
            if variant['repository'] == '.':
                variants[name] = variant
        treeinfo_serialized['variants'] = variants
        # Reset build_timestamp so Pulp will fetch all the addons during the next sync
        treeinfo_serialized['distribution_tree']['build_timestamp'] = 0

        return (DistributionTree(**treeinfo_serialized["distribution_tree"]),
                treeinfo_serialized)

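# Toy restatement of the variant filtering above: only variants whose repository is the root
# ('.') survive the Pulp 2 -> Pulp 3 conversion. The data is made up.
def example_keep_root_variants(treeinfo_serialized):
    return {
        name: variant
        for name, variant in treeinfo_serialized["variants"].items()
        if variant["repository"] == "."
    }

# example_keep_root_variants({"variants": {"Server": {"repository": "."},
#                                          "AppStream": {"repository": "AppStream"}}})
# -> {"Server": {"repository": "."}}
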
def set_up_queryset(self):
    """
    Set up a queryset for RepositoryVersion distribution tree repos.

    Returns:
        django.db.models.QuerySet: The subrepos contained in the repo version

    """
    try:
        tree = self.repo_version.content.get(pulp_type=DistributionTree.get_pulp_type())
    except Content.DoesNotExist:
        return []
    else:
        return tree.cast().repositories()

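# Hedged caller sketch (assumed, not from the plugin): consuming whatever set_up_queryset()
# returns; an empty list simply yields no sub-repositories.
def example_list_subrepo_names(detail_view):
    return [subrepo.name for subrepo in detail_view.set_up_queryset()]
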
def create_pulp3_content(self):
    """
    Create a Pulp 3 Distribution content for saving later in a bulk operation.

    Returns:
        (DistributionTree, dict): an in-memory DistributionTree content and serialized
            treeinfo data. Returns (None, None) if no treeinfo can be found.

    """
    treeinfo_serialized = self.get_treeinfo_serialized()
    if treeinfo_serialized is None:
        return None, None
    else:
        return (DistributionTree(**treeinfo_serialized["distribution_tree"]),
                treeinfo_serialized)

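# Hedged caller sketch: the (None, None) contract above lets a migration loop skip units that
# ship no treeinfo. `pre_migrators` and `save_distribution_tree` are hypothetical names.
def example_migrate_trees(pre_migrators):
    for pre_migrator in pre_migrators:
        tree, treeinfo_serialized = pre_migrator.create_pulp3_content()
        if tree is None:
            continue  # nothing to migrate for this unit
        save_distribution_tree(tree, treeinfo_serialized)  # hypothetical bulk-save step
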
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.
    """
    remote_url = self.new_url or self.remote.url
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"
    optimize_sync = self.optimize

    progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)

        # Caution: we are not storing when the remote was last updated, so the order of this
        # logic must remain in this order where we first check the version number as other
        # changes than sync could have taken place such that the date or repo version will be
        # different from last sync
        if (optimize_sync and
                self.repository.last_sync_remote and
                self.remote.pk == self.repository.last_sync_remote.pk and
                (self.repository.last_sync_repo_version ==
                    self.repository.latest_version().number) and
                (self.remote.pulp_last_updated <=
                    self.repository.latest_version().pulp_created) and
                is_previous_version(repomd.revision,
                                    self.repository.last_sync_revision_number)):
            optimize_data = dict(message='Optimizing Sync', code='optimizing.sync')
            with ProgressReport(**optimize_data) as optimize_pb:
                optimize_pb.done = 1
                optimize_pb.save()
                return

        self.repository.last_sync_revision_number = repomd.revision

        if self.treeinfo:
            d_artifacts = [
                DeclarativeArtifact(
                    artifact=Artifact(),
                    url=urljoin(remote_url, self.treeinfo["filename"]),
                    relative_path=".treeinfo",
                    remote=self.remote,
                    deferred_download=False,
                )
            ]
            for path, checksum in self.treeinfo["download"]["images"].items():
                artifact = Artifact(**checksum)

                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)

            distribution_tree = DistributionTree(**self.treeinfo["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
            dc.extra_data = self.treeinfo
            await self.put(dc)

        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None
        main_types = set()
        checksums = {}

        for record in repomd.records:
            checksums[record.type] = record.checksum_type.upper()
            if record.type in PACKAGE_REPODATA:
                main_types.update([record.type])
                package_repodata_urls[record.type] = urljoin(remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(url=comps_url)
            elif record.type in SKIP_REPODATA:
                continue
            elif '_zck' in record.type:
                continue
            elif record.type in MODULAR_REPODATA:
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(url=modules_url)
                modulemd_results = await modulemd_downloader.run()
            elif record.type not in PACKAGE_DB_REPODATA:
                file_data = {record.checksum_type: record.checksum, "size": record.size}
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da])
                await self.put(dc)

        missing_type = set(PACKAGE_REPODATA) - main_types
        if missing_type:
            raise FileNotFoundError(
                _("XML file(s): {filename} not found").format(
                    filename=", ".join(missing_type)))

        self.repository.original_checksum_types = checksums

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith('.gz') else open
            with open_func(modulemd_results.path, 'r') as moduleyaml:
                content = moduleyaml.read()
                module_content = content if isinstance(content, str) else content.decode()
                modulemd_index.update_from_string(module_content, True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            # Parsing modules happens all at one time, and from here on no useful work happens.
            # So just report that it finished this stage.
            modulemd_pb_data = {'message': 'Parsed Modulemd', 'code': 'parsing.modulemds'}
            with ProgressReport(**modulemd_pb_data) as modulemd_pb:
                modulemd_total = len(modulemd_all)
                modulemd_pb.total = modulemd_total
                modulemd_pb.done = modulemd_total

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME], modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION], modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(
                    artifact=artifact, relative_path=relative_path, url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in dc.content.artifacts:
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                modulemd_list.append(dc)

            # delete list now that we're done with it for memory savings
            del modulemd_all

            modulemd_default_names = parse_defaults(modulemd_index)

            # Parsing module-defaults happens all at one time, and from here on no useful
            # work happens. So just report that it finished this stage.
            modulemd_defaults_pb_data = {
                'message': 'Parsed Modulemd-defaults', 'code': 'parsing.modulemd_defaults'}
            with ProgressReport(**modulemd_defaults_pb_data) as modulemd_defaults_pb:
                modulemd_defaults_total = len(modulemd_default_names)
                modulemd_defaults_pb.total = modulemd_defaults_total
                modulemd_defaults_pb.done = modulemd_defaults_total

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(
                    artifact=artifact, relative_path=relative_path, url=modules_url)
                default_content = ModulemdDefaults(**default)
                dc = DeclarativeContent(content=default_content, d_artifacts=[da])
                await self.put(dc)

            # delete list now that we're done with it for memory savings
            del modulemd_default_names

        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            with ProgressReport(message='Parsed Comps', code='parsing.comps') as comps_pb:
                comps_total = (len(comps.groups) + len(comps.categories) +
                               len(comps.environments))
                comps_pb.total = comps_total
                comps_pb.done = comps_total

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)
                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[dc.content.id]:
                            dc.extra_data['environment_relations'].append(dc_environment)
                            dc_environment.extra_data['packagegroups'].append(dc)

                    if dc.content.id in optionalgroup_to_environments.keys():
                        for dc_environment in optionalgroup_to_environments[dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(dc)

                    dc_groups.append(dc)

            for dc_category in dc_categories:
                await self.put(dc_category)

            for dc_environment in dc_environments:
                await self.put(dc_environment)

            # delete lists now that we're done with them for memory savings
            del dc_environments
            del dc_categories

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders]

        while pending:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                try:
                    results = downloader.result()
                except ClientResponseError as exc:
                    raise HTTPNotFound(reason=_("File not found: {filename}").format(
                        filename=exc.request_info.url))
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)

                    # skip SRPM if defined
                    if 'srpm' in self.skip_types:
                        packages = {
                            pkgId: pkg for pkgId, pkg in packages.items()
                            if pkg.arch != 'src'
                        }

                    progress_data = {
                        'message': 'Parsed Packages',
                        'code': 'parsing.packages',
                        'total': len(packages),
                    }
                    with ProgressReport(**progress_data) as packages_pb:
                        for pkg in packages.values():
                            package = Package(**Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
                            checksum_type = getattr(
                                CHECKSUM_TYPES, package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(remote_url, package.location_href)
                            filename = os.path.basename(package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package, d_artifacts=[da])
                            dc.extra_data = defaultdict(list)

                            # find if a package relates to a modulemd
                            if dc.content.nevra in nevra_to_module.keys():
                                dc.content.is_modular = True
                                for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                    dc.extra_data['modulemd_relation'].append(dc_modulemd)
                                    dc_modulemd.extra_data['package_relation'].append(dc)

                            if dc.content.name in pkgname_to_groups.keys():
                                for dc_group in pkgname_to_groups[dc.content.name]:
                                    dc.extra_data['group_relations'].append(dc_group)
                                    dc_group.extra_data['related_packages'].append(dc)

                            packages_pb.increment()
                            await self.put(dc)
                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)

                    progress_data = {
                        'message': 'Parsed Advisories',
                        'code': 'parsing.advisories',
                        'total': len(updates),
                    }
                    with ProgressReport(**progress_data) as advisories_pb:
                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = hash_update_record(update)
                            future_relations = {
                                'collections': defaultdict(list), 'references': []}

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(**pkg_dict)
                                    future_relations['collections'][coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            advisories_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

        # now send modules down the pipeline since all relations have been set up
        for modulemd in modulemd_list:
            await self.put(modulemd)

        for dc_group in dc_groups:
            await self.put(dc_group)

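# The sync-skipping condition at the top of run() restated as a standalone predicate, purely
# for readability; it mirrors the same checks under the same assumptions. `is_previous_version`
# is the plugin's own helper and is only referenced here, not reimplemented.
def example_can_skip_sync(repository, remote, repomd_revision, optimize_sync):
    latest = repository.latest_version()
    return bool(
        optimize_sync
        and repository.last_sync_remote
        and remote.pk == repository.last_sync_remote.pk
        and repository.last_sync_repo_version == latest.number
        and remote.pulp_last_updated <= latest.pulp_created
        and is_previous_version(repomd_revision, repository.last_sync_revision_number)
    )
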
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.
    """
    packages_pb = ProgressReport(message='Parsed Packages', code='parsing.packages')
    errata_pb = ProgressReport(message='Parsed Erratum', code='parsing.errata')
    modulemd_pb = ProgressReport(message='Parse Modulemd', code='parsing.modulemds')
    modulemd_defaults_pb = ProgressReport(
        message='Parse Modulemd-defaults', code='parsing.modulemddefaults')
    comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

    packages_pb.save()
    errata_pb.save()
    comps_pb.save()

    remote_url = self.new_url or self.remote.url
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

    progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        if self.kickstart:
            d_artifacts = []
            for path, checksum in self.kickstart["download"]["images"].items():
                artifact = Artifact(**checksum)

                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)

            distribution_tree = DistributionTree(**self.kickstart["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
            dc.extra_data = self.kickstart
            await self.put(dc)

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(url=comps_url)
            elif record.type in SKIP_REPODATA:
                continue
            elif record.type in MODULAR_REPODATA:
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(url=modules_url)
                modulemd_results = await modulemd_downloader.run()
            elif record.type not in PACKAGE_DB_REPODATA:
                file_data = {record.checksum_type: record.checksum, "size": record.size}
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da])
                await self.put(dc)

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith('.gz') else open
            with open_func(modulemd_results.path, 'r') as moduleyaml:
                modulemd_index.update_from_string(moduleyaml.read().decode(), True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            modulemd_pb.total = len(modulemd_all)
            modulemd_pb.state = 'running'
            modulemd_pb.save()

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME], modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION], modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(
                    artifact=artifact, relative_path=relative_path, url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in json.loads(dc.content.artifacts):
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                modulemd_list.append(dc)

            modulemd_default_names = parse_defaults(modulemd_index)

            modulemd_defaults_pb.total = len(modulemd_default_names)
            modulemd_defaults_pb.state = 'running'
            modulemd_defaults_pb.save()

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(
                    artifact=artifact, relative_path=relative_path, url=modules_url)
                default_content = ModulemdDefaults(**default)
                modulemd_defaults_pb.increment()
                dc = DeclarativeContent(content=default_content, d_artifacts=[da])
                await self.put(dc)

        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            comps_pb.total = (len(comps.groups) + len(comps.categories) +
                              len(comps.environments))
            comps_pb.state = 'running'
            comps_pb.save()

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)
                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[dc.content.id]:
                            dc.extra_data['environment_relations'].append(dc_environment)
                            dc_environment.extra_data['packagegroups'].append(dc)

                    if dc.content.id in optionalgroup_to_environments.keys():
                        for dc_environment in optionalgroup_to_environments[dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(dc)

                    dc_groups.append(dc)

            for dc_category in dc_categories:
                comps_pb.increment()
                await self.put(dc_category)

            for dc_environment in dc_environments:
                comps_pb.increment()
                await self.put(dc_environment)

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders]

        while pending:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        dc.extra_data = defaultdict(list)

                        # find if a package relates to a modulemd
                        if dc.content.nevra in nevra_to_module.keys():
                            dc.content.is_modular = True
                            for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                dc.extra_data['modulemd_relation'].append(dc_modulemd)
                                dc_modulemd.extra_data['package_relation'].append(dc)

                        if dc.content.name in pkgname_to_groups.keys():
                            for dc_group in pkgname_to_groups[dc.content.name]:
                                dc.extra_data['group_relations'].append(dc_group)
                                dc_group.extra_data['related_packages'].append(dc)

                        packages_pb.increment()
                        await self.put(dc)
                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)
                    errata_pb.total = len(updates)
                    errata_pb.state = 'running'
                    errata_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)
                        future_relations = {'collections': defaultdict(list), 'references': []}

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        errata_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

        # now send modules down the pipeline since all relations have been set up
        for modulemd in modulemd_list:
            modulemd_pb.increment()
            await self.put(modulemd)

        for dc_group in dc_groups:
            comps_pb.increment()
            await self.put(dc_group)

    packages_pb.state = 'completed'
    errata_pb.state = 'completed'
    modulemd_pb.state = 'completed'
    modulemd_defaults_pb.state = 'completed'
    comps_pb.state = 'completed'
    packages_pb.save()
    errata_pb.save()
    modulemd_pb.save()
    modulemd_defaults_pb.save()
    comps_pb.save()

async def run(self):
    """
    Build `DeclarativeContent` from the repodata.
    """
    packages_pb = ProgressBar(message='Parsed Packages')
    erratum_pb = ProgressBar(message='Parsed Erratum')

    packages_pb.save()
    erratum_pb.save()

    remote_url = self.new_url or self.remote.url

    with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        if self.kickstart:
            d_artifacts = []
            for path, checksum in self.kickstart["download"]["images"].items():
                artifact = Artifact(**checksum)

                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)

            distribution_tree = DistributionTree(**self.kickstart["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
            dc.extra_data = self.kickstart
            await self.put(dc)

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            else:
                log.info(_('Unknown repodata type: {t}. Skipped.').format(t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders]

        while pending:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        packages_pb.increment()
                        await self.put(dc)
                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)
                    erratum_pb.total = len(updates)
                    erratum_pb.state = 'running'
                    erratum_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)
                        future_relations = {'collections': defaultdict(list), 'references': []}

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        erratum_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    packages_pb.state = 'completed'
    erratum_pb.state = 'completed'
    packages_pb.save()
    erratum_pb.save()
