def _init_dc_groups(self, comps):
    """Prepare declarative content for package groups found in comps.

    Builds a ``PackageGroup`` content unit per comps group, wraps it in a
    ``DeclarativeContent`` and records cross references so that relations to
    packages, categories and environments can be created later in the sync.

    Args:
        comps: A parsed ``libcomps.Comps`` object.
    """
    if comps.groups:
        for group in comps.groups:
            group_dict = PackageGroup.libcomps_to_dict(group)
            group_dict["digest"] = dict_digest(group_dict)
            packagegroup = PackageGroup(**group_dict)
            dc = DeclarativeContent(content=packagegroup)
            dc.extra_data = defaultdict(list)

            if packagegroup.packages:
                # remember which group DCs reference each package name so
                # packages parsed later can be related back to their groups
                for package in packagegroup.packages:
                    self.data.pkgname_to_groups[package["name"]].append(dc)

            # link this group with every category that lists it
            # (dict.get avoids the "x in d.keys()" anti-idiom and never
            # inserts a default entry into the defaultdict)
            for dc_category in self.group_to_categories.get(dc.content.id, ()):
                dc.extra_data["category_relations"].append(dc_category)
                dc_category.extra_data["packagegroups"].append(dc)

            # link this group with every environment that lists it
            for dc_environment in self.group_to_environments.get(dc.content.id, ()):
                dc.extra_data["environment_relations"].append(dc_environment)
                dc_environment.extra_data["packagegroups"].append(dc)

            # link this group with environments that list it as optional
            for dc_environment in self.optionalgroup_to_environments.get(dc.content.id, ()):
                dc.extra_data["env_relations_optional"].append(dc_environment)
                dc_environment.extra_data["optionalgroups"].append(dc)

            self.data.dc_groups.append(dc)
async def parse_distribution_tree(self):
    """Parse content from the file treeinfo if present."""
    if not self.treeinfo:
        return

    # the .treeinfo file itself is always downloaded immediately
    treeinfo_artifact = DeclarativeArtifact(
        artifact=Artifact(),
        url=urljoin(self.data.remote_url, self.treeinfo["filename"]),
        relative_path=".treeinfo",
        remote=self.remote,
        deferred_download=False,
    )
    # one declarative artifact per image referenced by the treeinfo;
    # these honor the remote's download policy
    image_artifacts = [
        DeclarativeArtifact(
            artifact=Artifact(**checksum),
            url=urljoin(self.data.remote_url, path),
            relative_path=path,
            remote=self.remote,
            deferred_download=self.deferred_download,
        )
        for path, checksum in self.treeinfo["download"]["images"].items()
    ]

    tree = DistributionTree(**self.treeinfo["distribution_tree"])
    dc = DeclarativeContent(
        content=tree, d_artifacts=[treeinfo_artifact] + image_artifacts
    )
    dc.extra_data = self.treeinfo
    await self.put(dc)
async def migrate_to_pulp3(self, batch, pb=None):
    """
    Docker specific implementation of DeclarativeContent creation for migrating
    docker content to Pulp 3.

    Args:
        batch: A batch of Pulp2Content objects to migrate to Pulp 3
        pb: Optional progress report; incremented once per migrated unit
    """
    for pulp2content in batch:
        pulp_2to3_detail_content = pulp2content.detail_model
        pulp3content = pulp_2to3_detail_content.create_pulp3_content()
        future_relations = {'pulp2content': pulp2content}
        # store digests for future pulp3 content relations
        if pulp_2to3_detail_content.type == 'docker_manifest':
            future_relations['blob_rel'] = pulp_2to3_detail_content.blobs
            future_relations['config_blob_rel'] = pulp_2to3_detail_content.config_blob
        if pulp_2to3_detail_content.type == 'docker_manifest_list':
            future_relations['man_rel'] = pulp_2to3_detail_content.listed_manifests
        # NOTE: the original tested `type == 'docker_tag'` twice back to back;
        # the two branches are merged here with identical behavior.
        if pulp_2to3_detail_content.type == 'docker_tag':
            future_relations['tag_rel'] = pulp_2to3_detail_content.tagged_manifest
            # dc without artifact, will assign artifact in the _pre_save hook
            dc = DeclarativeContent(content=pulp3content)
        else:
            artifact = await self.create_artifact(
                pulp2content.pulp2_storage_path,
                pulp_2to3_detail_content.expected_digests,
                pulp_2to3_detail_content.expected_size)
            da = DeclarativeArtifact(
                artifact=artifact,
                url=NOT_USED,
                relative_path=pulp_2to3_detail_content.relative_path_for_content_artifact,
                remote=NOT_USED,
                deferred_download=False)
            dc = DeclarativeContent(content=pulp3content,
                                    d_artifacts=[da],
                                    does_batch=False)
        dc.extra_data = future_relations
        await self.put(dc)
        if pb:
            pb.increment()
def _init_dc_categories(self, comps):
    """Prepare declarative content for package categories found in comps."""
    if not comps.categories:
        return
    for category in comps.categories:
        as_dict = PackageCategory.libcomps_to_dict(category)
        as_dict["digest"] = dict_digest(as_dict)
        dc = DeclarativeContent(content=PackageCategory(**as_dict))
        dc.extra_data = defaultdict(list)
        # remember which category DCs reference each group id, so groups
        # parsed later can be linked back to their categories
        for group_id in (dc.content.group_ids or []):
            self.group_to_categories[group_id["name"]].append(dc)
        self.dc_categories.append(dc)
async def _parse_packages(self, packages):
    """Parse repodata packages into Package declarative content.

    Consumes ``packages`` destructively via ``popitem(last=False)`` so each
    createrepo_c object can be freed as soon as it is converted, keeping peak
    memory low on large repositories. Each package gets one declarative
    artifact (honoring the remote's download policy) and is cross-linked to
    previously parsed modulemds and comps groups.

    Args:
        packages: Ordered mapping of pkgId -> createrepo_c package object.
    """
    progress_data = {
        "message": "Parsed Packages",
        "code": "sync.parsing.packages",
        "total": len(packages),
    }
    with ProgressReport(**progress_data) as packages_pb:
        while True:
            try:
                # pop in insertion order; frees each entry as we go
                (_, pkg) = packages.popitem(last=False)
            except KeyError:
                break
            package = Package(**Package.createrepo_to_dict(pkg))
            del pkg
            artifact = Artifact(size=package.size_package)
            checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
            setattr(artifact, checksum_type, package.pkgId)
            url = urlpath_sanitize(self.data.remote_url, package.location_href)
            filename = os.path.basename(package.location_href)
            da = DeclarativeArtifact(
                artifact=artifact,
                url=url,
                relative_path=filename,
                remote=self.remote,
                deferred_download=self.deferred_download,
            )
            dc = DeclarativeContent(content=package, d_artifacts=[da])
            dc.extra_data = defaultdict(list)

            # find if a package relates to a modulemd
            # (membership test directly on the dict — `in d.keys()` is redundant)
            if dc.content.nevra in self.data.nevra_to_module:
                dc.content.is_modular = True
                for dc_modulemd in self.data.nevra_to_module[dc.content.nevra]:
                    dc.extra_data["modulemd_relation"].append(dc_modulemd)
                    dc_modulemd.extra_data["package_relation"].append(dc)

            # find if a package is listed in any comps group
            if dc.content.name in self.data.pkgname_to_groups:
                for dc_group in self.data.pkgname_to_groups[dc.content.name]:
                    dc.extra_data["group_relations"].append(dc_group)
                    dc_group.extra_data["related_packages"].append(dc)

            packages_pb.increment()
            await self.put(dc)
def _init_dc_environments(self, comps):
    """Prepare declarative content for package environments found in comps."""
    if not comps.environments:
        return
    for environment in comps.environments:
        env_dict = PackageEnvironment.libcomps_to_dict(environment)
        env_dict["digest"] = dict_digest(env_dict)
        dc = DeclarativeContent(content=PackageEnvironment(**env_dict))
        dc.extra_data = defaultdict(list)
        # an environment references groups both as optional and as regular
        # members; record each so groups parsed later can be linked back
        for option_id in (dc.content.option_ids or []):
            self.optionalgroup_to_environments[option_id["name"]].append(dc)
        for group_id in (dc.content.group_ids or []):
            self.group_to_environments[group_id["name"]].append(dc)
        self.dc_environments.append(dc)
async def _parse_packages(self, packages):
    """Parse repodata packages into Package declarative content.

    Creates one deferred-downloadable artifact per package and cross-links
    each package to previously parsed modulemds and comps groups.

    Args:
        packages: Mapping of pkgId -> createrepo_c package object.
    """
    progress_data = {
        'message': 'Parsed Packages',
        'code': 'parsing.packages',
        'total': len(packages),
    }
    with ProgressReport(**progress_data) as packages_pb:
        for pkg in packages.values():
            package = Package(**Package.createrepo_to_dict(pkg))
            artifact = Artifact(size=package.size_package)
            checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
            setattr(artifact, checksum_type, package.pkgId)
            url = urljoin(self.data.remote_url, package.location_href)
            filename = os.path.basename(package.location_href)
            da = DeclarativeArtifact(
                artifact=artifact,
                url=url,
                relative_path=filename,
                remote=self.remote,
                deferred_download=self.deferred_download)
            dc = DeclarativeContent(content=package, d_artifacts=[da])
            dc.extra_data = defaultdict(list)

            # find if a package relates to a modulemd
            # (membership test directly on the dict — `in d.keys()` is redundant)
            if dc.content.nevra in self.data.nevra_to_module:
                dc.content.is_modular = True
                for dc_modulemd in self.data.nevra_to_module[dc.content.nevra]:
                    dc.extra_data['modulemd_relation'].append(dc_modulemd)
                    dc_modulemd.extra_data['package_relation'].append(dc)

            # find if a package is listed in any comps group
            if dc.content.name in self.data.pkgname_to_groups:
                for dc_group in self.data.pkgname_to_groups[dc.content.name]:
                    dc.extra_data['group_relations'].append(dc_group)
                    dc_group.extra_data['related_packages'].append(dc)

            packages_pb.increment()
            await self.put(dc)
def _parse_modulemd_list(self, modulemd_index):
    """Parse every modulemd from the index and register it for later linking."""
    modulemd_names = modulemd_index.get_module_names() or []
    modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

    # Parsing modules happens all at one time, and from here on no useful work happens.
    # So just report that it finished this stage.
    modulemd_pb_data = {
        "message": "Parsed Modulemd",
        "code": "parsing.modulemds"
    }
    with ProgressReport(**modulemd_pb_data) as modulemd_pb:
        modulemd_pb.total = modulemd_pb.done = len(modulemd_all)

        for md in modulemd_all:
            md_artifact = md.pop("artifact")
            relative_path = "{}{}{}{}{}snippet".format(
                md[PULP_MODULE_ATTR.NAME],
                md[PULP_MODULE_ATTR.STREAM],
                md[PULP_MODULE_ATTR.VERSION],
                md[PULP_MODULE_ATTR.CONTEXT],
                md[PULP_MODULE_ATTR.ARCH],
            )
            declarative_artifact = DeclarativeArtifact(
                artifact=md_artifact,
                relative_path=relative_path,
                url=self.data.modules_url,
            )
            dc = DeclarativeContent(
                content=Modulemd(**md), d_artifacts=[declarative_artifact]
            )
            dc.extra_data = defaultdict(list)

            # dc.content.artifacts are Modulemd artifacts
            for rpm_artifact in dc.content.artifacts:
                self.data.nevra_to_module.setdefault(rpm_artifact, set()).add(dc)
            self.data.modulemd_list.append(dc)

    # delete list now that we're done with it for memory savings
    del modulemd_all
async def migrate_to_pulp3(self, batch, pb=None):
    """
    A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

    Plugin writers might want to override this method if it doesn't satisfy their needs as is.

    Args:
        batch: A batch of Pulp2Content objects to migrate to Pulp 3
    """
    for pulp2content in batch:
        detail = pulp2content.detail_model.get()
        pulp3content = detail.create_pulp3_content()

        if pulp2content.downloaded:
            artifact = await self.create_artifact(
                pulp2content.pulp2_storage_path,
                detail.expected_digests,
                detail.expected_size)
            da = DeclarativeArtifact(
                artifact=artifact,
                url=NOT_USED,
                relative_path=detail.relative_path_for_content_artifact,
                remote=NOT_USED,
                deferred_download=False)
            dc = DeclarativeContent(content=pulp3content, d_artifacts=[da])
        else:
            # on_demand content is partially migrated - only Content is created at this stage.
            # Remote Artifact and Content Artifact should be created at the time of
            # importers/remotes migration. Rely on downloaded flag on Pulp2Content to
            # identify on_demand content.
            dc = DeclarativeContent(content=pulp3content)

        dc.extra_data = {'pulp2content': pulp2content}
        await self.put(dc)
        if pb:
            pb.increment()
async def _parse_advisories(self, updates):
    """Parse updateinfo records into UpdateRecord declarative content."""
    progress_data = {
        "message": "Parsed Advisories",
        "code": "parsing.advisories",
        "total": len(updates),
    }
    with ProgressReport(**progress_data) as advisories_pb:
        for update in updates:
            update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
            update_record.digest = hash_update_record(update)

            # collections and references are unsaved model instances; they are
            # stashed on extra_data and persisted once the record is saved
            collections = defaultdict(list)
            for collection in update.collections:
                coll = UpdateCollection(
                    **UpdateCollection.createrepo_to_dict(collection)
                )
                for package in collection.packages:
                    collections[coll].append(
                        UpdateCollectionPackage(
                            **UpdateCollectionPackage.createrepo_to_dict(package)
                        )
                    )
            references = [
                UpdateReference(**UpdateReference.createrepo_to_dict(reference))
                for reference in update.references
            ]

            advisories_pb.increment()
            dc = DeclarativeContent(content=update_record)
            dc.extra_data = {"collections": collections, "references": references}
            await self.put(dc)
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Pipeline, in order: download repomd.xml; short-circuit if the repo is
    unchanged (sync optimization); emit the distribution tree; classify
    repomd records; parse modulemds and defaults; parse comps; download and
    parse primary/filelists/other plus updateinfo; finally flush modulemds
    and groups once all relations are wired up.
    """
    remote_url = self.new_url or self.remote.url
    # normalize so urljoin below treats the URL as a directory
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"
    optimize_sync = self.optimize

    progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)

        # Caution: we are not storing when the remote was last updated, so the order of this
        # logic must remain in this order where we first check the version number as other
        # changes than sync could have taken place such that the date or repo version will be
        # different from last sync
        if (optimize_sync and self.repository.last_sync_remote and
                self.remote.pk == self.repository.last_sync_remote.pk and
                (self.repository.last_sync_repo_version ==
                 self.repository.latest_version().number) and
                (self.remote.pulp_last_updated <=
                 self.repository.latest_version().pulp_created) and
                is_previous_version(
                    repomd.revision, self.repository.last_sync_revision_number)):
            # nothing changed since the last sync with this remote — skip the work
            optimize_data = dict(message='Optimizing Sync', code='optimizing.sync')
            with ProgressReport(**optimize_data) as optimize_pb:
                optimize_pb.done = 1
                optimize_pb.save()
            return

        self.repository.last_sync_revision_number = repomd.revision

        # emit the distribution tree (treeinfo) content first, if present
        if self.treeinfo:
            d_artifacts = [
                DeclarativeArtifact(
                    artifact=Artifact(),
                    url=urljoin(remote_url, self.treeinfo["filename"]),
                    relative_path=".treeinfo",
                    remote=self.remote,
                    deferred_download=False,
                )
            ]
            for path, checksum in self.treeinfo["download"]["images"].items():
                artifact = Artifact(**checksum)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)
            distribution_tree = DistributionTree(
                **self.treeinfo["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree,
                                    d_artifacts=d_artifacts)
            dc.extra_data = self.treeinfo
            await self.put(dc)

        # bookkeeping for content parsed below and the relations between it
        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None
        main_types = set()
        checksums = {}

        # classify every repomd record and queue the required downloads
        for record in repomd.records:
            checksums[record.type] = record.checksum_type.upper()
            if record.type in PACKAGE_REPODATA:
                main_types.update([record.type])
                package_repodata_urls[record.type] = urljoin(
                    remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(
                    url=comps_url)
            elif record.type in SKIP_REPODATA:
                continue
            elif '_zck' in record.type:
                # zchunk variants of other records — not consumed
                continue
            elif record.type in MODULAR_REPODATA:
                # modules.yaml is downloaded eagerly because packages parsed
                # later must be relatable to their modulemds
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(
                    url=modules_url)
                modulemd_results = await modulemd_downloader.run()
            elif record.type not in PACKAGE_DB_REPODATA:
                # any other metadata file is preserved as-is as content
                file_data = {
                    record.checksum_type: record.checksum,
                    "size": record.size
                }
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file,
                                        d_artifacts=[da])
                await self.put(dc)

        missing_type = set(PACKAGE_REPODATA) - main_types
        if missing_type:
            # NOTE(review): the template has no "{filename}" placeholder, so
            # .format(filename=...) is a no-op and the missing type names
            # never appear in the message — confirm intended template.
            raise FileNotFoundError(
                _("XML file(s): (unknown) not found").format(
                    filename=", ".join(missing_type)))

        self.repository.original_checksum_types = checksums

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith(
                '.gz') else open
            with open_func(modulemd_results.path, 'r') as moduleyaml:
                content = moduleyaml.read()
                # gzip.open in 'r' mode yields bytes; plain open yields str
                module_content = content if isinstance(
                    content, str) else content.decode()
                modulemd_index.update_from_string(module_content, True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            # Parsing modules happens all at one time, and from here on no useful work happens.
            # So just report that it finished this stage.
            modulemd_pb_data = {
                'message': 'Parsed Modulemd',
                'code': 'parsing.modulemds'
            }
            with ProgressReport(**modulemd_pb_data) as modulemd_pb:
                modulemd_total = len(modulemd_all)
                modulemd_pb.total = modulemd_total
                modulemd_pb.done = modulemd_total

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME],
                    modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION],
                    modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content,
                                        d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in dc.content.artifacts:
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                modulemd_list.append(dc)

            # delete list now that we're done with it for memory savings
            del modulemd_all

            modulemd_default_names = parse_defaults(modulemd_index)
            # Parsing module-defaults happens all at one time, and from here on no useful
            # work happens. So just report that it finished this stage.
            modulemd_defaults_pb_data = {
                'message': 'Parsed Modulemd-defaults',
                'code': 'parsing.modulemd_defaults'
            }
            with ProgressReport(
                    **modulemd_defaults_pb_data) as modulemd_defaults_pb:
                modulemd_defaults_total = len(modulemd_default_names)
                modulemd_defaults_pb.total = modulemd_defaults_total
                modulemd_defaults_pb.done = modulemd_defaults_total

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                default_content = ModulemdDefaults(**default)
                dc = DeclarativeContent(content=default_content,
                                        d_artifacts=[da])
                await self.put(dc)

            # delete list now that we're done with it for memory savings
            del modulemd_default_names

        # parse comps (groups/categories/environments/langpacks), if present
        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            with ProgressReport(message='Parsed Comps',
                                code='parsing.comps') as comps_pb:
                comps_total = (len(comps.groups) + len(comps.categories) +
                               len(comps.environments))
                comps_pb.total = comps_total
                comps_pb.done = comps_total

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(
                    comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(
                        category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    # record which categories reference each group id
                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(
                                dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(
                        environment)
                    environment_dict['digest'] = dict_digest(
                        environment_dict)
                    packageenvironment = PackageEnvironment(
                        **environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    # record optional and regular group membership separately
                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[
                                option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(
                                dc)
                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    # wire group <-> category relations both ways
                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[
                                dc.content.id]:
                            dc.extra_data['category_relations'].append(
                                dc_category)
                            dc_category.extra_data['packagegroups'].append(
                                dc)

                    # wire group <-> environment relations both ways
                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[
                                dc.content.id]:
                            dc.extra_data['environment_relations'].append(
                                dc_environment)
                            dc_environment.extra_data[
                                'packagegroups'].append(dc)

                    # wire optional group <-> environment relations both ways
                    if dc.content.id in optionalgroup_to_environments.keys(
                    ):
                        for dc_environment in optionalgroup_to_environments[
                                dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(
                                dc_environment)
                            dc_environment.extra_data[
                                'optionalgroups'].append(dc)

                    dc_groups.append(dc)

            # categories/environments can flow now; groups are held back until
            # packages have been related to them (flushed at the very end)
            for dc_category in dc_categories:
                await self.put(dc_category)

            for dc_environment in dc_environments:
                await self.put(dc_environment)
            # delete lists now that we're done with them for memory savings
            del dc_environments
            del dc_categories

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group)
            for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                try:
                    results = downloader.result()
                except ClientResponseError as exc:
                    # NOTE(review): this template also lacks a "{filename}"
                    # placeholder, so the failing URL is dropped from the
                    # message — confirm intended template.
                    raise HTTPNotFound(
                        reason=_("File not found: (unknown)").format(
                            filename=exc.request_info.url))

                if results[0].url == package_repodata_urls['primary']:
                    # gather order guarantees primary/filelists/other order
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path,
                        other_xml_path)

                    # skip SRPM if defined
                    if 'srpm' in self.skip_types:
                        packages = {
                            pkgId: pkg
                            for pkgId, pkg in packages.items()
                            if pkg.arch != 'src'
                        }

                    progress_data = {
                        'message': 'Parsed Packages',
                        'code': 'parsing.packages',
                        'total': len(packages),
                    }
                    with ProgressReport(**progress_data) as packages_pb:
                        for pkg in packages.values():
                            package = Package(
                                **Package.createrepo_to_dict(pkg))
                            artifact = Artifact(size=package.size_package)
                            checksum_type = getattr(
                                CHECKSUM_TYPES,
                                package.checksum_type.upper())
                            setattr(artifact, checksum_type, package.pkgId)
                            url = urljoin(remote_url,
                                          package.location_href)
                            filename = os.path.basename(
                                package.location_href)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=url,
                                relative_path=filename,
                                remote=self.remote,
                                deferred_download=self.deferred_download)
                            dc = DeclarativeContent(content=package,
                                                    d_artifacts=[da])
                            dc.extra_data = defaultdict(list)

                            # find if a package relates to a modulemd
                            if dc.content.nevra in nevra_to_module.keys():
                                dc.content.is_modular = True
                                for dc_modulemd in nevra_to_module[
                                        dc.content.nevra]:
                                    dc.extra_data[
                                        'modulemd_relation'].append(
                                            dc_modulemd)
                                    dc_modulemd.extra_data[
                                        'package_relation'].append(dc)

                            # relate the package to any comps group listing it
                            if dc.content.name in pkgname_to_groups.keys():
                                for dc_group in pkgname_to_groups[
                                        dc.content.name]:
                                    dc.extra_data[
                                        'group_relations'].append(dc_group)
                                    dc_group.extra_data[
                                        'related_packages'].append(dc)

                            packages_pb.increment()
                            await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(
                        updateinfo_xml_path)

                    progress_data = {
                        'message': 'Parsed Advisories',
                        'code': 'parsing.advisories',
                        'total': len(updates),
                    }
                    with ProgressReport(**progress_data) as advisories_pb:
                        for update in updates:
                            update_record = UpdateRecord(
                                **UpdateRecord.createrepo_to_dict(update))
                            update_record.digest = hash_update_record(
                                update)
                            # collections/references are created later from
                            # extra_data, once the record itself is saved
                            future_relations = {
                                'collections': defaultdict(list),
                                'references': []
                            }

                            for collection in update.collections:
                                coll_dict = UpdateCollection.createrepo_to_dict(
                                    collection)
                                coll = UpdateCollection(**coll_dict)

                                for package in collection.packages:
                                    pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                        package)
                                    pkg = UpdateCollectionPackage(
                                        **pkg_dict)
                                    future_relations['collections'][
                                        coll].append(pkg)

                            for reference in update.references:
                                reference_dict = UpdateReference.createrepo_to_dict(
                                    reference)
                                ref = UpdateReference(**reference_dict)
                                future_relations['references'].append(ref)

                            advisories_pb.increment()
                            dc = DeclarativeContent(content=update_record)
                            dc.extra_data = future_relations
                            await self.put(dc)

        # now send modules down the pipeline since all relations have been set up
        for modulemd in modulemd_list:
            await self.put(modulemd)

        for dc_group in dc_groups:
            await self.put(dc_group)
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads repomd.xml, then primary/filelists/other and updateinfo in a
    deterministic order, and emits Package and UpdateRecord content.
    """
    # progress bars are created up front so they show as "waiting" until
    # the corresponding parsing stage actually starts
    packages_pb = ProgressBar(message='Parsed Packages')
    erratum_pb = ProgressBar(message='Parsed Erratum')

    packages_pb.save()
    erratum_pb.save()

    with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(self.remote.url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []

        # classify repomd records and queue the downloads they require
        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(
                    self.remote.url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(self.remote.url,
                                         record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            else:
                log.info(
                    _('Unknown repodata type: {t}. Skipped.').format(
                        t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group)
            for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    # gather order guarantees primary/filelists/other order
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path,
                        other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(
                            **Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(self.remote.url,
                                      package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package,
                                                d_artifacts=[da])
                        packages_pb.increment()
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(
                        updateinfo_xml_path)
                    erratum_pb.total = len(updates)
                    erratum_pb.state = 'running'
                    erratum_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(
                            update)
                        # collections/references are created later from
                        # extra_data, once the record itself is saved
                        future_relations = {
                            'collections': defaultdict(list),
                            'references': []
                        }

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(
                                collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                    package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][
                                    coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(
                                reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        erratum_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    packages_pb.state = 'completed'
    erratum_pb.state = 'completed'
    packages_pb.save()
    erratum_pb.save()
async def migrate_to_pulp3(self, content_model, content_type):
    """
    A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

    Plugin writers might want to override this method if it doesn't satisfy their needs as is.

    In this implementation there is an assumption that each content has one artifact.

    Args:
        content_model: Pulp 2to3 detail content model for the type being migrated
        content_type: type of pulp2 content that is being migrated
    """
    @functools.lru_cache(maxsize=20)
    def get_remote_by_importer_id(importer_id):
        """
        Look up the Pulp 3 remote created for a Pulp 2 importer.

        Args:
            importer_id(str): Id of an importer in Pulp 2

        Returns:
            remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3,
            or None if the importer is unknown.
        """
        try:
            pulp2importer = Pulp2Importer.objects.get(
                pulp2_object_id=importer_id)
        except ObjectDoesNotExist:
            return
        return pulp2importer.pulp3_remote

    futures = []
    # Flags describing how this content type is handled by the migrator.
    is_lazy_type = content_type in self.migrator.lazy_types
    is_artifactless_type = content_type in self.migrator.artifactless_types
    has_future = content_type in self.migrator.future_types
    is_multi_artifact = content_type in self.migrator.multi_artifact_types

    if is_lazy_type:
        # go through all of the content that haven't been migrated OR have been migrated
        # but have new lazy catalog entries.
        units_with_new_lces = Pulp2LazyCatalog.objects.filter(
            is_migrated=False).values('pulp2_unit_id').distinct()
        already_migrated = ~Q(pulp2content__pulp3_content=None)
        no_new_lces = ~Q(pulp2content__pulp2_id__in=units_with_new_lces)
        pulp_2to3_detail_qs = content_model.objects.exclude(
            already_migrated & no_new_lces)
    else:
        # go through all of the content that haven't been migrated
        pulp_2to3_detail_qs = content_model.objects.filter(
            pulp2content__pulp3_content=None)

    # order by pulp2_repo if it's set
    if content_model.set_pulp2_repo:
        pulp_2to3_detail_qs = pulp_2to3_detail_qs.order_by('repo_id')

    with ProgressReport(message='Migrating {} content to Pulp 3 {}'.format(
            self.migrator.pulp2_plugin, content_type),
            code='migrating.{}.content'.format(self.migrator.pulp2_plugin),
            total=pulp_2to3_detail_qs.count()) as pb:
        select_extra = [
            'pulp2content',
            'pulp2content__pulp3_content',
        ]
        if content_model.set_pulp2_repo:
            select_extra.append('pulp2content__pulp2_repo')
        pulp_2to3_detail_qs = pulp_2to3_detail_qs.select_related(*select_extra)
        for pulp_2to3_detail_content in pulp_2to3_detail_qs.iterator(
                chunk_size=800):
            dc = None
            pulp2content = pulp_2to3_detail_content.pulp2content
            # only content that supports on_demand download can have entries in LCE
            if is_lazy_type:
                # get all Lazy Catalog Entries (LCEs) for this content
                pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
                    pulp2_unit_id=pulp2content.pulp2_id,
                    is_migrated=False,
                )
                if not pulp2content.downloaded and not pulp2lazycatalog:
                    # NOTE: Logger.warn is deprecated; use warning()
                    _logger.warning(
                        _('On_demand content cannot be migrated without an entry in the '
                          'lazy catalog, pulp2 unit_id: {}'.format(
                              pulp2content.pulp2_id)))
                    continue

            if pulp2content.pulp3_content is not None and is_lazy_type and pulp2lazycatalog:
                # find already created pulp3 content
                pulp3content = pulp2content.pulp3_content
                extra_info = None
                if is_multi_artifact:
                    extra_info = pulp_2to3_detail_content.get_treeinfo_serialized()
                    # If we can't find the .treeinfo for the Distribution, warn and skip
                    if extra_info is None:
                        _logger.warning(
                            _("Failed to find or instantiate extra_info for multi-artifact "
                              "pulp2 unit_id: {} ; skipping".format(
                                  pulp2content.pulp2_id)))
                        continue
            else:
                # create pulp3 content and assign relations if present
                pulp3content, extra_info = pulp_2to3_detail_content.create_pulp3_content()
                # If we can't find/create the Distribution, warn and skip
                if pulp3content is None:
                    _logger.warning(
                        _("Failed to find or instantiate pulp3 content for pulp2 unit_id: {} ;"
                          " skipping".format(pulp2content.pulp2_id)))
                    continue

            # future_relations is shared (by reference) with every dc emitted for this
            # unit, so later .update() calls are visible to all of them.
            future_relations = {'pulp2content': pulp2content}
            if extra_info:
                future_relations.update(extra_info)

            if is_multi_artifact:
                # One content unit backed by several image files (+ .treeinfo).
                d_artifacts = []
                base_path = pulp2content.pulp2_storage_path
                remotes = set()
                missing_artifact = False
                for image_relative_path in extra_info['download']['images']:
                    image_path = os.path.join(base_path, image_relative_path)
                    downloaded = os.path.exists(image_path)
                    if downloaded:
                        artifact = await self.create_artifact(
                            image_path, None, None, downloaded=downloaded)
                        if artifact is None:
                            continue
                    else:
                        # placeholder artifact for deferred (on_demand) download
                        artifact = Artifact()

                    lces = pulp2lazycatalog.filter(pulp2_storage_path=image_path)
                    if lces:
                        remote_declarative_artifacts = []
                        for lce in lces:
                            remote = get_remote_by_importer_id(lce.pulp2_importer_id)
                            if not remote and not downloaded:
                                continue
                            remotes.add(remote)
                            da = DeclarativeArtifact(
                                artifact=artifact,
                                url=lce.pulp2_url,
                                relative_path=image_relative_path,
                                remote=remote,
                                deferred_download=not downloaded)
                            remote_declarative_artifacts.append(da)
                        if not remote_declarative_artifacts:
                            missing_artifact = True
                            break
                        d_artifacts.extend(remote_declarative_artifacts)
                    else:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=image_relative_path,
                            remote=None,
                            deferred_download=False)
                        d_artifacts.append(da)

                if missing_artifact:
                    _logger.warning(
                        _('On_demand content cannot be migrated without a remote '
                          'pulp2 unit_id: {}'.format(
                              pulp2content.pulp2_id)))
                    continue

                for lce in pulp2lazycatalog:
                    lce.is_migrated = True
                future_relations.update({'lces': list(pulp2lazycatalog)})

                # We do this last because we need the remote url which is only found in the LCE
                # of the image files. There is no LCE for the .treeinfo file itself.
                relative_path = pulp_2to3_detail_content.relative_path_for_content_artifact
                treeinfo_path = os.path.join(
                    pulp2content.pulp2_storage_path, relative_path)
                artifact = await self.create_artifact(treeinfo_path,
                                                      None,
                                                      None,
                                                      downloaded=True)
                if artifact is None:
                    continue
                if remotes:
                    for remote in remotes:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=urljoin(remote.url, relative_path),
                            relative_path=relative_path,
                            remote=remote,
                            deferred_download=False,
                        )
                        d_artifacts.append(da)
                else:
                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=NOT_USED,
                        relative_path=relative_path,
                        remote=None,
                        deferred_download=False,
                    )
                    d_artifacts.append(da)
                dc = DeclarativeContent(content=pulp3content,
                                        d_artifacts=d_artifacts)
                dc.extra_data = future_relations
                await self.put(dc)

            # not all content units have files, create DC without artifact
            elif is_artifactless_type:
                # dc without artifact
                dc = DeclarativeContent(content=pulp3content)
                dc.extra_data = future_relations
                await self.put(dc)
            else:
                # create artifact for content that has file
                artifact = await self.create_artifact(
                    pulp2content.pulp2_storage_path,
                    pulp_2to3_detail_content.expected_digests,
                    pulp_2to3_detail_content.expected_size,
                    downloaded=pulp2content.downloaded)
                if artifact is None:
                    continue

                if is_lazy_type and pulp2lazycatalog:
                    # handle DA and RA creation for content that supports on_demand
                    # Downloaded or on_demand content with LCEs.
                    #
                    # To create multiple remote artifacts, create multiple instances of
                    # declarative content which will differ by url/remote in their
                    # declarative artifacts
                    at_least_one_lce_migrated = False
                    for lce in pulp2lazycatalog:
                        remote = get_remote_by_importer_id(lce.pulp2_importer_id)
                        deferred_download = not pulp2content.downloaded
                        if not remote and deferred_download:
                            continue
                        relative_path = (
                            pulp_2to3_detail_content.relative_path_for_content_artifact)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=lce.pulp2_url,
                            relative_path=relative_path,
                            remote=remote,
                            deferred_download=deferred_download)
                        lce.is_migrated = True
                        at_least_one_lce_migrated = True
                        dc = DeclarativeContent(content=pulp3content,
                                                d_artifacts=[da])
                        dc.extra_data = future_relations
                        await self.put(dc)
                    if not at_least_one_lce_migrated:
                        _logger.warning(
                            _('On_demand content cannot be migrated without a remote '
                              'pulp2 unit_id: {}'.format(
                                  pulp2content.pulp2_id)))
                    future_relations.update({'lces': list(pulp2lazycatalog)})
                else:
                    relative_path = (
                        pulp_2to3_detail_content.relative_path_for_content_artifact)
                    da = DeclarativeArtifact(artifact=artifact,
                                             url=NOT_USED,
                                             relative_path=relative_path,
                                             remote=None,
                                             deferred_download=False)
                    dc = DeclarativeContent(content=pulp3content,
                                            d_artifacts=[da])
                    dc.extra_data = future_relations
                    await self.put(dc)

            if pb:
                pb.increment()
            if has_future and dc:
                futures.append(dc)
            # resolve accumulated futures in batches, and at the very end
            resolve_futures = len(futures) >= DEFAULT_BATCH_SIZE or pb.done == pb.total
            if resolve_futures:
                for dc in futures:
                    await dc.resolution()
                futures.clear()
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Downloads repomd.xml, then the referenced repodata files (primary/filelists/
    other, updateinfo, comps, modulemd), parses them, and emits DeclarativeContent
    for packages, errata, modules, module-defaults, comps units and other repo
    metadata files. Cross-content relations (package<->module, package<->group,
    group<->category/environment) are recorded on ``dc.extra_data`` before the
    dependent content is sent down the pipeline.
    """
    # Progress reports for each parsed content type.
    packages_pb = ProgressReport(message='Parsed Packages', code='parsing.packages')
    errata_pb = ProgressReport(message='Parsed Erratum', code='parsing.errata')
    modulemd_pb = ProgressReport(message='Parse Modulemd', code='parsing.modulemds')
    modulemd_defaults_pb = ProgressReport(
        message='Parse Modulemd-defaults', code='parsing.modulemddefaults')
    comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

    packages_pb.save()
    errata_pb.save()
    comps_pb.save()

    # Normalize the base url to always end with a slash so urljoin works.
    remote_url = self.new_url or self.remote.url
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

    progress_data = dict(message='Downloading Metadata Files',
                         code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        # Emit the distribution tree (kickstart) content, if one was detected.
        if self.kickstart:
            d_artifacts = []
            for path, checksum in self.kickstart["download"]["images"].items():
                artifact = Artifact(**checksum)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)
            distribution_tree = DistributionTree(
                **self.kickstart["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree,
                                    d_artifacts=d_artifacts)
            dc.extra_data = self.kickstart
            await self.put(dc)

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        # Relation lookup tables, filled while parsing modules/comps and
        # consulted later when packages are parsed.
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None

        # Classify every repomd record and set up the needed downloaders.
        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(
                    remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(url=comps_url)
            elif record.type in SKIP_REPODATA:
                continue
            elif record.type in MODULAR_REPODATA:
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(
                    url=modules_url)
                modulemd_results = await modulemd_downloader.run()
            elif record.type not in PACKAGE_DB_REPODATA:
                # Unknown metadata types are preserved as RepoMetadataFile content.
                file_data = {
                    record.checksum_type: record.checksum,
                    "size": record.size
                }
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file,
                                        d_artifacts=[da])
                await self.put(dc)

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith(
                '.gz') else open
            with open_func(modulemd_results.path, 'r') as moduleyaml:
                modulemd_index.update_from_string(
                    moduleyaml.read().decode(), True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)
            modulemd_pb.total = len(modulemd_all)
            modulemd_pb.state = 'running'
            modulemd_pb.save()

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME],
                    modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION],
                    modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content,
                                        d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in json.loads(dc.content.artifacts):
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                # modules are held back until package relations are known
                modulemd_list.append(dc)

            modulemd_default_names = parse_defaults(modulemd_index)
            modulemd_defaults_pb.total = len(modulemd_default_names)
            modulemd_defaults_pb.state = 'running'
            modulemd_defaults_pb.save()

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                default_content = ModulemdDefaults(**default)
                modulemd_defaults_pb.increment()
                dc = DeclarativeContent(content=default_content,
                                        d_artifacts=[da])
                await self.put(dc)

        # Parse comps (groups/categories/environments/langpacks), recording
        # group<->category and group<->environment relations as we go.
        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            comps_pb.total = (len(comps.groups) + len(comps.categories) +
                              len(comps.environments))
            comps_pb.state = 'running'
            comps_pb.save()

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(
                    comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(
                        environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[
                                option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)
                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    # wire up bidirectional group<->category relations
                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(
                                dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    # wire up bidirectional group<->environment relations
                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[
                                dc.content.id]:
                            dc.extra_data['environment_relations'].append(
                                dc_environment)
                            dc_environment.extra_data['packagegroups'].append(
                                dc)

                    # optional-group relations to environments
                    if dc.content.id in optionalgroup_to_environments.keys():
                        for dc_environment in optionalgroup_to_environments[
                                dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(
                                dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(
                                dc)

                    # groups are held back until package relations are known
                    dc_groups.append(dc)

            for dc_category in dc_categories:
                comps_pb.increment()
                await self.put(dc_category)

            for dc_environment in dc_environments:
                comps_pb.increment()
                await self.put(dc_environment)

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group)
            for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    # primary/filelists/other arrive together, in this order
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package,
                                                d_artifacts=[da])
                        dc.extra_data = defaultdict(list)

                        # find if a package relates to a modulemd
                        if dc.content.nevra in nevra_to_module.keys():
                            dc.content.is_modular = True
                            for dc_modulemd in nevra_to_module[
                                    dc.content.nevra]:
                                dc.extra_data['modulemd_relation'].append(
                                    dc_modulemd)
                                dc_modulemd.extra_data[
                                    'package_relation'].append(dc)

                        # find if a package belongs to any comps groups
                        if dc.content.name in pkgname_to_groups.keys():
                            for dc_group in pkgname_to_groups[
                                    dc.content.name]:
                                dc.extra_data['group_relations'].append(
                                    dc_group)
                                dc_group.extra_data[
                                    'related_packages'].append(dc)

                        packages_pb.increment()
                        await self.put(dc)
                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(
                        updateinfo_xml_path)
                    errata_pb.total = len(updates)
                    errata_pb.state = 'running'
                    errata_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(
                            update)
                        future_relations = {
                            'collections': defaultdict(list),
                            'references': []
                        }

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(
                                collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(
                                    package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(
                                    pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(
                                reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        errata_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

        # now send modules down the pipeline since all relations have been set up
        for modulemd in modulemd_list:
            modulemd_pb.increment()
            await self.put(modulemd)

        for dc_group in dc_groups:
            comps_pb.increment()
            await self.put(dc_group)

    # mark all progress reports finished
    packages_pb.state = 'completed'
    errata_pb.state = 'completed'
    modulemd_pb.state = 'completed'
    modulemd_defaults_pb.state = 'completed'
    comps_pb.state = 'completed'
    packages_pb.save()
    errata_pb.save()
    modulemd_pb.save()
    modulemd_defaults_pb.save()
    comps_pb.save()
async def migrate_to_pulp3(self, batch, pb=None):
    """
    A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

    Plugin writers might want to override this method if it doesn't satisfy their needs as is.

    In this implementation there is an assumption that each content has one artifact.

    Args:
        batch: A batch of Pulp2Content objects to migrate to Pulp 3
        pb: An optional progress report, incremented once per processed unit
    """
    def get_remote_by_importer_id(importer_id):
        """
        Look up the Pulp 3 remote created for a Pulp 2 importer.

        Args:
            importer_id(str): Id of an importer in Pulp 2

        Returns:
            remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3,
            or None if the importer is unknown.
        """
        try:
            pulp2importer = Pulp2Importer.objects.get(
                pulp2_object_id=importer_id)
        except ObjectDoesNotExist:
            return
        return pulp2importer.pulp3_remote

    for pulp2content in batch:
        pulp_2to3_detail_content = pulp2content.detail_model

        # get all Lazy Catalog Entries (LCEs) for this content
        pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
            pulp2_unit_id=pulp2content.pulp2_id)

        if not pulp2lazycatalog and not pulp2content.downloaded:
            # NOTE: Logger.warn is deprecated; use warning()
            _logger.warning(
                _('On_demand content cannot be migrated without an entry in the lazy '
                  'catalog, pulp2 unit_id: {}'.format(
                      pulp2content.pulp2_id)))
            continue

        pulp3content = await pulp_2to3_detail_content.create_pulp3_content()
        # record the originating pulp2 unit for later relation resolution
        future_relations = {'pulp2content': pulp2content}

        artifact = await self.create_artifact(
            pulp2content.pulp2_storage_path,
            pulp_2to3_detail_content.expected_digests,
            pulp_2to3_detail_content.expected_size,
            downloaded=pulp2content.downloaded)

        # Downloaded content with no LCE
        if not pulp2lazycatalog and pulp2content.downloaded:
            da = DeclarativeArtifact(
                artifact=artifact,
                url=NOT_USED,
                relative_path=pulp_2to3_detail_content.
                relative_path_for_content_artifact,
                remote=None,
                deferred_download=False)
            dc = DeclarativeContent(content=pulp3content, d_artifacts=[da])
            dc.extra_data = future_relations
            await self.put(dc)

        # Downloaded or on_demand content with LCEs.
        #
        # To create multiple remote artifacts, create multiple instances of declarative
        # content which will differ by url/remote in their declarative artifacts
        for lce in pulp2lazycatalog:
            remote = get_remote_by_importer_id(lce.pulp2_importer_id)
            deferred_download = not pulp2content.downloaded
            if not remote and deferred_download:
                _logger.warning(
                    _('On_demand content cannot be migrated without a remote '
                      'pulp2 unit_id: {}'.format(pulp2content.pulp2_id)))
                continue

            da = DeclarativeArtifact(
                artifact=artifact,
                url=lce.pulp2_url,
                relative_path=pulp_2to3_detail_content.
                relative_path_for_content_artifact,
                remote=remote,
                deferred_download=deferred_download)
            dc = DeclarativeContent(content=pulp3content, d_artifacts=[da])
            dc.extra_data = future_relations
            await self.put(dc)
        if pb:
            pb.increment()
async def migrate_to_pulp3(self, content_model, content_type):
    """
    A default implementation of DeclarativeContent creation for migrating content to Pulp 3.

    Plugin writers might want to override this method if it doesn't satisfy their needs as is.

    In this implementation there is an assumption that each content has one artifact.

    Args:
        content_model: Pulp 2to3 detail content model for the type being migrated
        content_type: type of pulp2 content that is being migrated
    """
    @functools.lru_cache(maxsize=20)
    def get_remote_by_importer_id(importer_id):
        """
        Look up the Pulp 3 remote created for a Pulp 2 importer.

        Args:
            importer_id(str): Id of an importer in Pulp 2

        Returns:
            remote(pulpcore.app.models.Remote): A corresponding remote in Pulp 3,
            or None if the importer is unknown.
        """
        try:
            pulp2importer = Pulp2Importer.objects.get(
                pulp2_object_id=importer_id)
        except ObjectDoesNotExist:
            return
        return pulp2importer.pulp3_remote

    futures = []
    # Flags describing how this content type is handled by the migrator.
    is_lazy_type = content_type in self.migrator.lazy_types
    is_artifactless_type = content_type in self.migrator.artifactless_types
    has_future = content_type in self.migrator.future_types
    is_multi_artifact = content_type in self.migrator.multi_artifact_types

    if is_lazy_type:
        # go through all of the content that haven't been migrated OR have been migrated
        # but have new lazy catalog entries.
        units_with_new_lces = (Pulp2LazyCatalog.objects.filter(
            is_migrated=False).values("pulp2_unit_id").distinct())
        already_migrated = ~Q(pulp2content__pulp3_content=None)
        no_new_lces = ~Q(pulp2content__pulp2_id__in=units_with_new_lces)
        pulp_2to3_detail_qs = content_model.objects.exclude(
            already_migrated & no_new_lces)
    else:
        # go through all of the content that haven't been migrated
        pulp_2to3_detail_qs = content_model.objects.filter(
            pulp2content__pulp3_content=None)

    # order by pulp2_repo if it's set
    if content_model.set_pulp2_repo:
        pulp_2to3_detail_qs = pulp_2to3_detail_qs.order_by("repo_id")

    async with ProgressReport(
        message="Migrating {} content to Pulp 3".format(content_type),
        code="migrating.{}.content".format(self.migrator.pulp2_plugin),
        total=await sync_to_async(pulp_2to3_detail_qs.count)(),
    ) as pb:
        select_extra = [
            "pulp2content",
            "pulp2content__pulp3_content",
        ]
        if content_model.set_pulp2_repo:
            select_extra.append("pulp2content__pulp2_repo")
        pulp_2to3_detail_qs = pulp_2to3_detail_qs.select_related(*select_extra)
        async for pulp_2to3_detail_content in sync_to_async_iterable(
                pulp_2to3_detail_qs.iterator(chunk_size=800)):
            dc = None
            pulp2content = await sync_to_async(
                Pulp2Content.objects.get)(
                    pk=pulp_2to3_detail_content.pulp2content.pk)
            # only content that supports on_demand download can have entries in LCE
            if is_lazy_type:
                # get all Lazy Catalog Entries (LCEs) for this content
                pulp2lazycatalog = Pulp2LazyCatalog.objects.filter(
                    pulp2_unit_id=pulp2content.pulp2_id,
                    is_migrated=False,
                )
                await sync_to_async(bool)(pulp2lazycatalog
                                          )  # force queryset to evaluate

                if not pulp2content.downloaded and not pulp2lazycatalog:
                    # A distribution tree can be from an on_demand repo but without any images,
                    # e.g. CentOS 8 High Availability. Do not skip in that case.
                    if not is_multi_artifact:
                        # NOTE: Logger.warn is deprecated; use warning()
                        _logger.warning(
                            _("On_demand content cannot be migrated without an entry in the "
                              "lazy catalog, pulp2 unit_id: {}".format(
                                  pulp2content.pulp2_id)))
                        continue

            if (pulp2content.pulp3_content is not None and is_lazy_type
                    and pulp2lazycatalog):
                # find already created pulp3 content
                pulp3content = pulp2content.pulp3_content
                extra_info = None
                if is_multi_artifact:
                    extra_info = pulp_2to3_detail_content.get_treeinfo_serialized()
                    # If we can't find the .treeinfo for the Distribution, warn and skip
                    if extra_info is None:
                        _logger.warning(
                            _("Failed to find or instantiate extra_info for multi-artifact "
                              "pulp2 unit_id: {} ; skipping".format(
                                  pulp2content.pulp2_id)))
                        continue
            else:
                # create pulp3 content and assign relations if present
                pulp3content, extra_info = await sync_to_async(
                    pulp_2to3_detail_content.create_pulp3_content)()
                # If we can't find/create the Distribution, warn and skip
                if pulp3content is None:
                    _logger.warning(
                        _("Failed to find or instantiate pulp3 content for pulp2 unit_id: {} ;"
                          " skipping".format(pulp2content.pulp2_id)))
                    continue

            # future_relations is shared (by reference) with every dc emitted for this
            # unit, so later .update() calls are visible to all of them.
            future_relations = {"pulp2content": pulp2content}
            if extra_info:
                future_relations.update(extra_info)

            if is_multi_artifact:
                # One content unit backed by several image files (+ .treeinfo).
                d_artifacts = []
                base_path = pulp2content.pulp2_storage_path
                remotes = set()
                missing_artifact = False
                remote_declarative_artifacts = []
                for image_relative_path in extra_info["download"]["images"]:
                    remote_url_tuples = []
                    image_path = os.path.join(base_path, image_relative_path)
                    downloaded = os.path.exists(image_path)
                    if downloaded:
                        artifact = await self.create_artifact(
                            image_path, None, None, downloaded=downloaded)
                        if artifact is None:
                            continue
                    else:
                        # placeholder artifact for deferred (on_demand) download
                        artifact = Artifact()

                    lces = await sync_to_async(list)(
                        pulp2lazycatalog.filter(pulp2_storage_path=image_path))
                    if not lces and not downloaded:
                        continue

                    # collect all urls and respective migrated remotes for the image
                    for lce in lces:
                        remote = await sync_to_async(
                            get_remote_by_importer_id)(lce.pulp2_importer_id)
                        if remote:
                            remotes.add(remote)
                            remote_url_tuples.append((remote, lce.pulp2_url))

                    for remote, url in remote_url_tuples:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            # BUGFIX: use the url paired with this remote; the previous
                            # code used lce.pulp2_url, i.e. the url of the *last* LCE,
                            # for every remote artifact of this image.
                            url=url,
                            relative_path=image_relative_path,
                            remote=remote,
                            deferred_download=not downloaded,
                        )
                        remote_declarative_artifacts.append(da)

                    if not remote_url_tuples:
                        # either no LCEs existed but it's a downloaded content (and we can
                        # proceed), or remotes for any of LCEs haven't been migrated (and
                        # nothing can be done at this point)
                        if not downloaded:
                            missing_artifact = True
                            break
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=NOT_USED,
                            relative_path=image_relative_path,
                            remote=None,
                            deferred_download=False,
                        )
                        d_artifacts.append(da)
                d_artifacts.extend(remote_declarative_artifacts)

                # Only skip the rest of the steps if there are any images that are expected
                # to be downloaded. There are distribution trees without images in the wild,
                # e.g. CentOS 8 High Availability.
                if missing_artifact and extra_info["download"]["images"]:
                    _logger.warning(
                        _("On_demand content cannot be migrated without a remote "
                          "pulp2 unit_id: {}".format(pulp2content.pulp2_id)))
                    continue

                for lce in pulp2lazycatalog:
                    lce.is_migrated = True
                future_relations.update({"lces": list(pulp2lazycatalog)})

                # We do this last because we need the remote url which is only found in the LCE
                # of the image files. There is no LCE for the .treeinfo file itself.
                relative_path = (pulp_2to3_detail_content.
                                 relative_path_for_content_artifact)
                treeinfo_path = os.path.join(
                    pulp2content.pulp2_storage_path, relative_path)
                artifact = await self.create_artifact(treeinfo_path,
                                                      None,
                                                      None,
                                                      downloaded=True)
                if artifact is None:
                    continue
                if remotes:
                    for remote in remotes:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=os.path.join(remote.url, relative_path),
                            relative_path=relative_path,
                            remote=remote,
                            deferred_download=False,
                        )
                        d_artifacts.append(da)
                else:
                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=NOT_USED,
                        relative_path=relative_path,
                        remote=None,
                        deferred_download=False,
                    )
                    d_artifacts.append(da)

                dc = DeclarativeContent(content=pulp3content,
                                        d_artifacts=d_artifacts)
                dc.extra_data = future_relations
                await self.put(dc)

            # not all content units have files, create DC without artifact
            elif is_artifactless_type:
                # dc without artifact
                dc = DeclarativeContent(content=pulp3content)
                dc.extra_data = future_relations
                await self.put(dc)
            else:
                # create artifact for content that has file
                artifact = await self.create_artifact(
                    pulp2content.pulp2_storage_path,
                    pulp_2to3_detail_content.expected_digests,
                    pulp_2to3_detail_content.expected_size,
                    downloaded=pulp2content.downloaded,
                )
                if artifact is None:
                    if pb:
                        await pb.aincrement()
                    continue

                relative_path = (pulp_2to3_detail_content.
                                 relative_path_for_content_artifact)
                remote_lce_tuples = []
                deferred_download = not pulp2content.downloaded

                if is_lazy_type and pulp2lazycatalog:
                    for lce in pulp2lazycatalog:
                        remote = await sync_to_async(
                            get_remote_by_importer_id)(lce.pulp2_importer_id)
                        if remote:
                            remote_lce_tuples.append((remote, lce))

                # handle DA and RA creation for content that supports on_demand
                # Downloaded or on_demand content with LCEs.
                #
                # To create multiple remote artifacts, create multiple instances of
                # declarative content which will differ by url/remote in their
                # declarative artifacts
                if remote_lce_tuples:
                    for remote, lce in remote_lce_tuples:
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=lce.pulp2_url,
                            relative_path=relative_path,
                            remote=remote,
                            deferred_download=deferred_download,
                        )
                        lce.is_migrated = True
                        dc = DeclarativeContent(content=pulp3content,
                                                d_artifacts=[da])

                        # yes, all LCEs are assigned for each dc to be resolved at a later
                        # stage. Some LCEs might be "bad" and not have a migrated importer
                        # but we still need to resolved such. It creates some duplicated LCEs
                        # to process later but ensures that all are resolved if at least one
                        # valid one is migrated.
                        future_relations.update(
                            {"lces": list(pulp2lazycatalog)})
                        dc.extra_data = future_relations
                        await self.put(dc)
                else:
                    # No migratable LCE available
                    if deferred_download:
                        _logger.warning(
                            _("On_demand content cannot be migrated without a remote "
                              "pulp2 unit_id: {}".format(
                                  pulp2content.pulp2_id)))
                        continue

                    da = DeclarativeArtifact(
                        artifact=artifact,
                        url=NOT_USED,
                        relative_path=relative_path,
                        remote=None,
                        deferred_download=False,
                    )
                    dc = DeclarativeContent(content=pulp3content,
                                            d_artifacts=[da])
                    dc.extra_data = future_relations
                    await self.put(dc)

            if pb:
                await pb.aincrement()
            if has_future and dc:
                futures.append(dc)
            # resolve accumulated futures in batches
            resolve_futures = len(futures) >= DEFAULT_BATCH_SIZE
            if resolve_futures:
                for dc in futures:
                    await dc.resolution()
                futures.clear()

    # resolve futures if there are any left
    for dc in futures:
        await dc.resolution()
    futures.clear()