async def _should_we_sync(self):
    """
    Check last synced metadata time.

    Records a completed "noop" progress report and then decides whether a
    sync has to be performed.

    Returns:
        bool: ``False`` only when the remote reports API version 3, every
        requested collection comes from this remote, and the remote's
        ``published`` timestamp equals the one stored on the repository.
        ``True`` in every other case (a sync is needed or cannot be ruled out).
    """
    msg = _("no-op: Checking if remote changed since last sync.")
    noop = ProgressReport(message=msg, code="noop")
    noop.state = TASK_STATES.COMPLETED
    noop.save()

    # No remote remembered on the repository: nothing to compare against.
    if not self.repository.remote:
        return True

    # A different remote than the one remembered on the repository.
    if self.remote != self.repository.remote.cast():
        return True

    root, api_version = await self._get_root_api(self.remote.url)
    if api_version == 3:
        downloader = self.remote.get_downloader(
            url=root, silence_errors_for_response_status_codes={404}
        )
        try:
            metadata = parse_metadata(await downloader.run())
        except FileNotFoundError:
            return True

        try:
            self.last_synced_metadata_time = parse_datetime(metadata["published"])
        except KeyError:
            # The metadata carries no "published" entry; we cannot compare.
            return True

        # If any collection names a source other than this remote, the single
        # timestamp of this remote says nothing about the others.
        sources = set()
        if self.collection_info:
            sources = {r.source for r in self.collection_info if r.source}
        sources.add(self.remote.url)
        if len(sources) > 1:
            return True

        if self.last_synced_metadata_time == self.repository.last_synced_metadata_time:
            # Translate the template first and format afterwards; formatting
            # before the lookup produces a string no catalog entry can match.
            noop.message = _(
                "no-op: {remote} did not change since last sync - {published}"
            ).format(remote=self.remote.url, published=self.last_synced_metadata_time)
            noop.save()
            return False

    return True
async def run(self):
    """
    ContainerFirstStage.

    Download the tag list, download the manifest (or manifest list) behind
    every remaining tag, and put tag, manifest, manifest-list and blob
    declarative content into the pipeline.
    """
    pending_manifests = []
    manifest_dcs_by_digest = {}
    blob_dcs = []

    with ProgressReport(
        message='Downloading tag list', code='downloading.tag_list', total=1
    ) as tag_list_pb:
        repo_name = self.remote.namespaced_upstream_name
        tag_list_url = urljoin(
            self.remote.url, '/v2/{name}/tags/list'.format(name=repo_name)
        )
        list_downloader = self.remote.get_downloader(url=tag_list_url)
        await list_downloader.run(extra_data={'repo_name': repo_name})

        with open(list_downloader.path) as tags_raw:
            tag_list = json.load(tags_raw)['tags']

        # check for the presence of the pagination link header
        link = list_downloader.response_headers.get('Link')
        await self.handle_pagination(link, repo_name, tag_list)
        tag_list = self.filter_tags(tag_list)
        tag_list_pb.increment()

    def manifest_url(tag_name):
        """Return the manifest URL of ``tag_name`` on the remote."""
        path = '/v2/{name}/manifests/{tag}'.format(
            name=self.remote.namespaced_upstream_name,
            tag=tag_name,
        )
        return urljoin(self.remote.url, path)

    # One download coroutine per tag, in tag-list order.
    tag_downloads = [
        self.remote.get_downloader(url=manifest_url(tag_name)).run(
            extra_data={'headers': V2_ACCEPT_HEADERS}
        )
        for tag_name in tag_list
    ]

    pb_parsed_tags = ProgressReport(
        message='Processing Tags',
        code='processing.tag',
        state=TASK_STATES.RUNNING,
        total=len(tag_list),
    )

    for finished_download in asyncio.as_completed(tag_downloads):
        tag = await finished_download
        with open(tag.path, 'rb') as manifest_file:
            raw_data = manifest_file.read()
        content_data = json.loads(raw_data)
        media_type = content_data.get('mediaType')

        tag.artifact_attributes['file'] = tag.path
        saved_artifact = Artifact(**tag.artifact_attributes)
        try:
            saved_artifact.save()
        except IntegrityError:
            # Saving failed on an integrity constraint: look the artifact up
            # by its remaining attributes instead.
            del tag.artifact_attributes['file']
            saved_artifact = Artifact.objects.get(**tag.artifact_attributes)
        tag_dc = self.create_tag(saved_artifact, tag.url)

        if media_type not in (MEDIA_TYPE.MANIFEST_LIST, MEDIA_TYPE.INDEX_OCI):
            # The tag points at a single manifest.
            man_dc = self.create_tagged_manifest(tag_dc, content_data, raw_data)
            await self.put(man_dc)
            tag_dc.extra_data['man_relation'] = man_dc
            self.handle_blobs(man_dc, content_data, blob_dcs)
        else:
            # The tag points at a manifest list; its manifests are resolved
            # after all tags have been processed.
            list_dc = self.create_tagged_manifest_list(tag_dc, content_data)
            await self.put(list_dc)
            tag_dc.extra_data['man_relation'] = list_dc
            for manifest_data in content_data.get('manifests'):
                man_dc = self.create_manifest(list_dc, manifest_data)
                pending_manifests.append(man_dc)
                manifest_dcs_by_digest[man_dc.content.digest] = man_dc
                await self.put(man_dc)

        await self.put(tag_dc)
        pb_parsed_tags.increment()

    pb_parsed_tags.state = 'completed'
    pb_parsed_tags.save()

    for pending_manifest in pending_manifests:
        man = await pending_manifest.resolution()
        with man._artifacts.get().file.open() as content_file:
            content_data = json.loads(content_file.read())
        self.handle_blobs(manifest_dcs_by_digest[man.digest], content_data, blob_dcs)

    for blob_dc in blob_dcs:
        await self.put(blob_dc)
async def pre_migrate_content(content_model, mutable_type, premigrate_hook):
    """
    A coroutine to pre-migrate Pulp 2 content, including all details for on_demand content.

    Pulp 2 (mongo) records of the given type whose ``_last_updated`` is at or after the newest
    timestamp already stored in ``Pulp2Content`` are turned into ``Pulp2Content`` rows in
    batches. Each saved batch is handed to the model's ``pre_migrate_content_detail`` and two
    progress reports (general info / detail info) are kept up to date. Finally the lazy catalog
    for the content type is pre-migrated.

    Args:
        content_model: Models for content which is being migrated.
        mutable_type: Boolean that indicates whether the content type is mutable.
        premigrate_hook: Optional callable. When truthy it is called without arguments and its
            result is used as an ``id__in`` filter on the Pulp 2 query.
    """
    batch_size = 1000
    content_type = content_model.pulp2.TYPE_ID
    pulp2content = []
    pulp2mutatedcontent = []

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(
        pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(
        Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug(
        'The latest migrated {type} content has {timestamp} timestamp.'.format(
            type=content_type, timestamp=last_updated))

    if premigrate_hook:
        # restrict the query to the ids the hook hands back
        pulp2_content_ids = premigrate_hook()
        mongo_content_qs = content_model.pulp2.objects(
            _last_updated__gte=last_updated, id__in=pulp2_content_ids)
    else:
        # query only newly created/updated items
        mongo_content_qs = content_model.pulp2.objects(
            _last_updated__gte=last_updated)

    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type, total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(
            content_type.upper()),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(
            content_type.upper()),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    # number of already pre-migrated records seen since the last batch save
    existing_count = 0
    # only fetch the fields that are copied into Pulp2Content
    fields = set(['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        fields.add('downloaded')
    for i, record in enumerate(
            mongo_content_qs.only(*fields).batch_size(batch_size)):
        if record._last_updated == last_updated:
            # corner case - content with the last``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(
                pulp2_last_updated=last_updated, pulp2_id=record.id)
            if migrated:
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2content_pb.save()
                pulp2detail_pb.total -= 1
                pulp2detail_pb.save()
            # NOTE(review): as written, a record whose timestamp equals `last_updated` but that
            # is not yet pre-migrated falls through without being queued - confirm intended.
        else:
            if mutable_type:
                # This is a mutable content type. Query for the existing pulp2content.
                # If one was found, it means that the migrated content is older than the incoming.
                # Delete outdated migrated pulp2content and create a new pulp2content
                try:
                    outdated = Pulp2Content.objects.get(pulp2_id=record.id)
                except Pulp2Content.DoesNotExist:
                    pass
                else:
                    # remember the unit so its repositories can be flagged below
                    pulp2mutatedcontent.append(outdated.pulp2_id)
                    outdated.delete()

            downloaded = record.downloaded if hasattr(record,
                                                      'downloaded') else False
            item = Pulp2Content(pulp2_id=record.id,
                                pulp2_content_type_id=record._content_type_id,
                                pulp2_last_updated=record._last_updated,
                                pulp2_storage_path=record._storage_path,
                                downloaded=downloaded)
            _logger.debug(
                'Add content item to the list to migrate: {item}'.format(
                    item=item))
            pulp2content.append(item)

        # determine if the batch needs to be saved, also take into account whether there is
        # anything in the pulp2content to be saved
        save_batch = pulp2content and (i and not (i + 1) % batch_size
                                       or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.
                format(index=i + 1))
            pulp2content_batch = Pulp2Content.objects.bulk_create(
                pulp2content, ignore_conflicts=True)
            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            await content_model.pulp_2to3_detail.pre_migrate_content_detail(
                pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            # start a fresh batch
            pulp2content = []
            existing_count = 0

    if pulp2mutatedcontent:
        # when we flip the is_migrated flag to False, we base this decision on the last_unit_added
        # https://github.com/pulp/pulp-2to3-migration/blob/master/pulp_2to3_migration/app/pre_migration.py#L279  # noqa
        # in this case, we still need to update the is_migrated flag manually because of errata.
        # in pulp2 sync and copy cases of updated errata are not covered
        # only when uploading errata last_unit_added is updated on all the repos that contain it
        mutated_content = Pulp2RepoContent.objects.filter(
            pulp2_unit_id__in=pulp2mutatedcontent)
        repo_to_update_ids = set(
            mutated_content.values_list('pulp2_repository_id', flat=True))
        repos_to_update = []
        for pulp2repo in Pulp2Repository.objects.filter(
                pk__in=repo_to_update_ids):
            pulp2repo.is_migrated = False
            repos_to_update.append(pulp2repo)

        Pulp2Repository.objects.bulk_update(objs=repos_to_update,
                                            fields=['is_migrated'],
                                            batch_size=1000)

    await pre_migrate_lazycatalog(content_type)

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.

    Steps, in order:

    1. Download ``repodata/repomd.xml`` and, when ``self.kickstart`` is set, emit the
       distribution tree with its image artifacts.
    2. Walk the repomd records: remember package repodata URLs, schedule the updateinfo
       download, prepare the comps downloader, download the modules file, and emit any
       remaining (non package-db) record as a ``RepoMetadataFile``.
    3. Parse modulemd / modulemd-defaults and comps so that relations to packages can be
       attached while packages are parsed.
    4. Download and parse package repodata and updateinfo, emitting packages and errata.
    5. Emit modulemds and package groups last, once their relations are complete.
    """
    packages_pb = ProgressReport(message='Parsed Packages', code='parsing.packages')
    errata_pb = ProgressReport(message='Parsed Erratum', code='parsing.errata')
    modulemd_pb = ProgressReport(message='Parse Modulemd', code='parsing.modulemds')
    modulemd_defaults_pb = ProgressReport(
        message='Parse Modulemd-defaults', code='parsing.modulemddefaults')
    comps_pb = ProgressReport(message='Parsed Comps', code='parsing.comps')

    packages_pb.save()
    errata_pb.save()
    comps_pb.save()

    # urljoin() below needs a trailing slash to treat the last segment as a directory
    remote_url = self.new_url or self.remote.url
    remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/"

    progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata')
    with ProgressReport(**progress_data) as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(remote_url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        if self.kickstart:
            d_artifacts = []
            for path, checksum in self.kickstart["download"]["images"].items():
                artifact = Artifact(**checksum)
                da = DeclarativeArtifact(
                    artifact=artifact,
                    url=urljoin(remote_url, path),
                    relative_path=path,
                    remote=self.remote,
                    deferred_download=self.deferred_download)
                d_artifacts.append(da)

            distribution_tree = DistributionTree(**self.kickstart["distribution_tree"])
            dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts)
            dc.extra_data = self.kickstart
            await self.put(dc)

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []
        modulemd_list = list()
        dc_groups = []
        dc_categories = []
        dc_environments = []
        # lookup tables used to wire relations between content units
        nevra_to_module = defaultdict(dict)
        pkgname_to_groups = defaultdict(list)
        group_to_categories = defaultdict(list)
        group_to_environments = defaultdict(list)
        optionalgroup_to_environments = defaultdict(list)
        modulemd_results = None
        comps_downloader = None

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(
                    remote_url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(remote_url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            elif record.type in COMPS_REPODATA:
                comps_url = urljoin(remote_url, record.location_href)
                comps_downloader = self.remote.get_downloader(url=comps_url)
            elif record.type in SKIP_REPODATA:
                continue
            elif record.type in MODULAR_REPODATA:
                modules_url = urljoin(remote_url, record.location_href)
                modulemd_downloader = self.remote.get_downloader(url=modules_url)
                modulemd_results = await modulemd_downloader.run()
            elif record.type not in PACKAGE_DB_REPODATA:
                # any other repodata file is synced as-is
                file_data = {
                    record.checksum_type: record.checksum,
                    "size": record.size
                }
                da = DeclarativeArtifact(
                    artifact=Artifact(**file_data),
                    url=urljoin(remote_url, record.location_href),
                    relative_path=record.location_href,
                    remote=self.remote,
                    deferred_download=False)
                repo_metadata_file = RepoMetadataFile(
                    data_type=record.type,
                    checksum_type=record.checksum_type,
                    checksum=record.checksum,
                )
                dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da])
                await self.put(dc)

        # we have to sync module.yaml first if it exists, to make relations to packages
        if modulemd_results:
            modulemd_index = mmdlib.ModuleIndex.new()
            open_func = gzip.open if modulemd_results.url.endswith('.gz') else open
            # Open in binary mode for both openers: plain open() in text mode returns
            # str, which has no .decode(), so an uncompressed modules file used to fail.
            with open_func(modulemd_results.path, 'rb') as moduleyaml:
                modulemd_index.update_from_string(
                    moduleyaml.read().decode(), True)

            modulemd_names = modulemd_index.get_module_names() or []
            modulemd_all = parse_modulemd(modulemd_names, modulemd_index)

            modulemd_pb.total = len(modulemd_all)
            modulemd_pb.state = 'running'
            modulemd_pb.save()

            for modulemd in modulemd_all:
                artifact = modulemd.pop('artifact')
                relative_path = '{}{}{}{}{}snippet'.format(
                    modulemd[PULP_MODULE_ATTR.NAME],
                    modulemd[PULP_MODULE_ATTR.STREAM],
                    modulemd[PULP_MODULE_ATTR.VERSION],
                    modulemd[PULP_MODULE_ATTR.CONTEXT],
                    modulemd[PULP_MODULE_ATTR.ARCH])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                modulemd_content = Modulemd(**modulemd)
                dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da])
                dc.extra_data = defaultdict(list)

                # dc.content.artifacts are Modulemd artifacts
                for artifact in json.loads(dc.content.artifacts):
                    nevra_to_module.setdefault(artifact, set()).add(dc)
                # modulemds are emitted at the very end, once packages are related
                modulemd_list.append(dc)

            modulemd_default_names = parse_defaults(modulemd_index)

            modulemd_defaults_pb.total = len(modulemd_default_names)
            modulemd_defaults_pb.state = 'running'
            modulemd_defaults_pb.save()

            for default in modulemd_default_names:
                artifact = default.pop('artifact')
                relative_path = '{}{}snippet'.format(
                    default[PULP_MODULEDEFAULTS_ATTR.MODULE],
                    default[PULP_MODULEDEFAULTS_ATTR.STREAM])
                da = DeclarativeArtifact(artifact=artifact,
                                         relative_path=relative_path,
                                         url=modules_url)
                default_content = ModulemdDefaults(**default)
                modulemd_defaults_pb.increment()
                dc = DeclarativeContent(content=default_content, d_artifacts=[da])
                await self.put(dc)

        if comps_downloader:
            comps_result = await comps_downloader.run()

            comps = libcomps.Comps()
            comps.fromxml_f(comps_result.path)

            comps_pb.total = (len(comps.groups) + len(comps.categories)
                              + len(comps.environments))
            comps_pb.state = 'running'
            comps_pb.save()

            if comps.langpacks:
                langpack_dict = PackageLangpacks.libcomps_to_dict(comps.langpacks)
                packagelangpack = PackageLangpacks(
                    matches=strdict_to_dict(comps.langpacks),
                    digest=dict_digest(langpack_dict))
                dc = DeclarativeContent(content=packagelangpack)
                dc.extra_data = defaultdict(list)
                await self.put(dc)

            if comps.categories:
                for category in comps.categories:
                    category_dict = PackageCategory.libcomps_to_dict(category)
                    category_dict['digest'] = dict_digest(category_dict)
                    packagecategory = PackageCategory(**category_dict)
                    dc = DeclarativeContent(content=packagecategory)
                    dc.extra_data = defaultdict(list)

                    if packagecategory.group_ids:
                        for group_id in packagecategory.group_ids:
                            group_to_categories[group_id['name']].append(dc)
                    dc_categories.append(dc)

            if comps.environments:
                for environment in comps.environments:
                    environment_dict = PackageEnvironment.libcomps_to_dict(environment)
                    environment_dict['digest'] = dict_digest(environment_dict)
                    packageenvironment = PackageEnvironment(**environment_dict)
                    dc = DeclarativeContent(content=packageenvironment)
                    dc.extra_data = defaultdict(list)

                    if packageenvironment.option_ids:
                        for option_id in packageenvironment.option_ids:
                            optionalgroup_to_environments[option_id['name']].append(dc)

                    if packageenvironment.group_ids:
                        for group_id in packageenvironment.group_ids:
                            group_to_environments[group_id['name']].append(dc)

                    dc_environments.append(dc)

            if comps.groups:
                for group in comps.groups:
                    group_dict = PackageGroup.libcomps_to_dict(group)
                    group_dict['digest'] = dict_digest(group_dict)
                    packagegroup = PackageGroup(**group_dict)
                    dc = DeclarativeContent(content=packagegroup)
                    dc.extra_data = defaultdict(list)

                    if packagegroup.packages:
                        for package in packagegroup.packages:
                            pkgname_to_groups[package['name']].append(dc)

                    # cross-link the group with the categories/environments naming it
                    if dc.content.id in group_to_categories.keys():
                        for dc_category in group_to_categories[dc.content.id]:
                            dc.extra_data['category_relations'].append(dc_category)
                            dc_category.extra_data['packagegroups'].append(dc)

                    if dc.content.id in group_to_environments.keys():
                        for dc_environment in group_to_environments[dc.content.id]:
                            dc.extra_data['environment_relations'].append(dc_environment)
                            dc_environment.extra_data['packagegroups'].append(dc)

                    if dc.content.id in optionalgroup_to_environments.keys():
                        for dc_environment in optionalgroup_to_environments[dc.content.id]:
                            dc.extra_data['env_relations_optional'].append(dc_environment)
                            dc_environment.extra_data['optionalgroups'].append(dc)

                    # groups are emitted at the very end, once packages are related
                    dc_groups.append(dc)

            for dc_category in dc_categories:
                comps_pb.increment()
                await self.put(dc_category)

            for dc_environment in dc_environments:
                comps_pb.increment()
                await self.put(dc_environment)

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group) for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    # three package repodata files were downloaded in this group
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(remote_url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        dc.extra_data = defaultdict(list)

                        # find if a package relates to a modulemd
                        if dc.content.nevra in nevra_to_module.keys():
                            dc.content.is_modular = True
                            for dc_modulemd in nevra_to_module[dc.content.nevra]:
                                dc.extra_data['modulemd_relation'].append(dc_modulemd)
                                dc_modulemd.extra_data['package_relation'].append(dc)

                        if dc.content.name in pkgname_to_groups.keys():
                            for dc_group in pkgname_to_groups[dc.content.name]:
                                dc.extra_data['group_relations'].append(dc_group)
                                dc_group.extra_data['related_packages'].append(dc)

                        packages_pb.increment()
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)

                    errata_pb.total = len(updates)
                    errata_pb.state = 'running'
                    errata_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)
                        future_relations = {
                            'collections': defaultdict(list),
                            'references': []
                        }

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        errata_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    # now send modules down the pipeline since all relations have been set up
    for modulemd in modulemd_list:
        modulemd_pb.increment()
        await self.put(modulemd)

    for dc_group in dc_groups:
        comps_pb.increment()
        await self.put(dc_group)

    packages_pb.state = 'completed'
    errata_pb.state = 'completed'
    modulemd_pb.state = 'completed'
    modulemd_defaults_pb.state = 'completed'
    comps_pb.state = 'completed'
    packages_pb.save()
    errata_pb.save()
    modulemd_pb.save()
    modulemd_defaults_pb.save()
    comps_pb.save()
async def pre_migrate_content(content_model):
    """
    A coroutine to pre-migrate Pulp 2 content.

    Pulp 2 (mongo) records of the given type whose ``_last_updated`` is at or after the newest
    timestamp already stored in ``Pulp2Content`` are turned into ``Pulp2Content`` rows in
    batches. Each saved batch is handed to the model's ``pre_migrate_content_detail`` and two
    progress reports (general info / detail info) are kept up to date.

    Args:
        content_model: Models for content which is being migrated.
    """
    batch_size = 10000
    content_type = content_model.pulp2.type
    pulp2content = []

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(
        Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug(
        'The latest migrated {type} content has {timestamp} timestamp.'.format(
            type=content_type, timestamp=last_updated))

    # query only newly created/updated items
    mongo_content_qs = content_model.pulp2.objects(_last_updated__gte=last_updated)
    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type, total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(
            content_type.upper()),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(
            content_type.upper()),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    # number of already pre-migrated records seen since the last batch save
    existing_count = 0
    # only fetch the fields that are copied into Pulp2Content
    fields = set(['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        fields.add('downloaded')

    for i, record in enumerate(mongo_content_qs.only(*fields).batch_size(batch_size)):
        already_migrated = False
        if record._last_updated == last_updated:
            # corner case - content with the last``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(
                pulp2_last_updated=last_updated, pulp2_id=record.id)
            if migrated.exists():
                already_migrated = True
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated
                # before and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2content_pb.save()
                pulp2detail_pb.total -= 1
                pulp2detail_pb.save()

        if not already_migrated:
            downloaded = record.downloaded if hasattr(record, 'downloaded') else False
            item = Pulp2Content(pulp2_id=record.id,
                                pulp2_content_type_id=record._content_type_id,
                                pulp2_last_updated=record._last_updated,
                                pulp2_storage_path=record._storage_path,
                                downloaded=downloaded)
            _logger.debug('Add content item to the list to migrate: {item}'.format(
                item=item))
            pulp2content.append(item)

        # Decide whether to flush on EVERY iteration, including those for already
        # pre-migrated records: skipping the check for such a record in the last
        # position would leave the pending batch unsaved. Nothing is flushed when
        # there is nothing pending.
        save_batch = pulp2content and (i and not (i + 1) % batch_size
                                       or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.
                format(index=i + 1))
            pulp2content_batch = Pulp2Content.objects.bulk_create(
                pulp2content, ignore_conflicts=True)
            # NOTE(review): already pre-migrated records are removed from `total` above
            # and never enter the batch, so subtracting existing_count here looks like
            # double accounting - confirm.
            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            await content_model.pulp_2to3_detail.pre_migrate_content_detail(
                pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            # start a fresh batch
            pulp2content = []
            existing_count = 0

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
def pre_migrate_content_type(content_model, mutable_type, lazy_type, premigrate_hook):
    """
    Pre-migrate Pulp 2 content of one type, including all details for on_demand content.

    This is a regular (synchronous) function. It removes ``Pulp2Content`` rows whose Pulp 2
    unit no longer exists, creates ``Pulp2Content`` rows in batches for units created/updated
    since the newest timestamp already pre-migrated, hands every saved batch to the model's
    ``pre_migrate_content_detail``, and keeps two progress reports (general info / detail
    info) up to date.

    Args:
        content_model: Models for content which is being migrated.
        mutable_type: Boolean that indicates whether the content type is mutable.
        lazy_type: When truthy, the lazy catalog for this content type is pre-migrated at
            the end.
        premigrate_hook: Optional callable. When truthy it is called without arguments and
            its result is used as an ``id__in`` filter on the Pulp 2 query.
    """
    def delete_removed_pulp2_content(content_model):
        """
        Delete Pulp2Content records for content which is no longer present in Pulp2.

        This is to avoid situations and extra work when not all content migrated during the first
        migration run, then orphan clean up is run in Pulp 2, and then migration is run again.

        Args:
            content_model: Pulp 2 content model

        """
        content_type = content_model.pulp2.TYPE_ID
        mongo_content_qs = content_model.pulp2.objects().only('id')
        mongo_content_ids = {c['_id'] for c in mongo_content_qs.as_pymongo().no_cache()}
        premigrated_content_ids = set(
            Pulp2Content.objects.filter(
                pulp2_content_type_id=content_type
            ).only('pulp2_id').values_list('pulp2_id', flat=True)
        )
        # pre-migrated ids that no longer have a counterpart in Pulp 2
        content_ids_to_delete = premigrated_content_ids - mongo_content_ids
        if content_ids_to_delete:
            Pulp2Content.objects.filter(
                pulp2_content_type_id=content_type,
                pulp2_id__in=content_ids_to_delete
            ).delete()

    batch_size = settings.CONTENT_PREMIGRATION_BATCH_SIZE or DEFAULT_BATCH_SIZE
    pulp2content = []
    pulp2mutatedcontent = []
    content_type = content_model.pulp2.TYPE_ID
    set_pulp2_repo = content_model.pulp_2to3_detail.set_pulp2_repo

    delete_removed_pulp2_content(content_model)

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug('The latest migrated {type} content has {timestamp} timestamp.'.format(
        type=content_type,
        timestamp=last_updated))

    query_args = {}
    if premigrate_hook:
        # restrict the query to the ids the hook hands back
        pulp2_content_ids = premigrate_hook()
        query_args["id__in"] = pulp2_content_ids

    mongo_content_qs = content_model.pulp2.objects(
        _last_updated__gte=last_updated, **query_args
    ).order_by("_last_updated")

    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type,
        total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(content_type),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(content_type),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    # number of already pre-migrated records seen since the last batch save
    existing_count = 0

    if mutable_type:
        pulp2_content_ids = []

        # collect ids of all incoming units, except those already pre-migrated with the
        # very same (latest) timestamp
        for c in mongo_content_qs.only('id', '_last_updated').no_cache().as_pymongo():
            if c['_last_updated'] == last_updated:
                if Pulp2Content.objects.filter(
                        pulp2_last_updated=last_updated, pulp2_id=c['_id']).exists():
                    continue
            pulp2_content_ids.append(c['_id'])

        # This is a mutable content type. Query for the existing pulp2content.
        # If any was found, it means that the migrated content is older than the incoming.
        # Delete outdated migrated pulp2content and create a new pulp2content
        outdated = Pulp2Content.objects.filter(pulp2_id__in=pulp2_content_ids)
        if outdated.exists():
            pulp2mutatedcontent.extend(pulp2_content_ids)
        outdated.delete()

    # only fetch the fields that are copied into Pulp2Content
    mongo_fields = set(['id', '_storage_path', '_last_updated', '_content_type_id'])
    if hasattr(content_model.pulp2, 'downloaded'):
        mongo_fields.add('downloaded')

    batched_mongo_content_qs = mongo_content_qs.only(*mongo_fields).batch_size(batch_size)
    for i, record in enumerate(batched_mongo_content_qs.no_cache()):
        if record._last_updated == last_updated:
            # corner case - content with the last``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(pulp2_last_updated=last_updated,
                                                   pulp2_id=record.id)
            if migrated.exists():
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2detail_pb.total -= 1
                continue

        downloaded = record.downloaded if hasattr(record, 'downloaded') else False

        if set_pulp2_repo:
            # This content requires to set pulp 2 repo. E.g. for errata, because 1 pulp2
            # content unit is converted into N pulp3 content units and repo_id is the only
            # way to have unique records for those.
            content_relations = Pulp2RepoContent.objects.filter(
                pulp2_unit_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                pulp2_repository__not_in_plan=False,
            ).select_related(
                'pulp2_repository'
            ).only(
                'pulp2_repository'
            )
            # one Pulp2Content per repository that contains the unit
            for relation in content_relations.iterator():
                item = Pulp2Content(
                    pulp2_id=record.id,
                    pulp2_content_type_id=record._content_type_id,
                    pulp2_last_updated=record._last_updated,
                    pulp2_storage_path=record._storage_path,
                    downloaded=downloaded,
                    pulp2_repo=relation.pulp2_repository,
                )
                _logger.debug(
                    'Add content item to the list to migrate: {item}'.format(item=item))
                pulp2content.append(item)
                pulp2content_pb.total += 1
                pulp2detail_pb.total += 1

            # total needs to be adjusted, proper counting happened in the loop above,
            # so we subtract one because this content is also a part of initial 'total' counter.
            pulp2content_pb.total -= 1
            pulp2detail_pb.total -= 1
        else:
            item = Pulp2Content(
                pulp2_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                pulp2_last_updated=record._last_updated,
                pulp2_storage_path=record._storage_path,
                downloaded=downloaded
            )
            _logger.debug('Add content item to the list to migrate: {item}'.format(item=item))
            pulp2content.append(item)

        # determine if the batch needs to be saved, also take into account whether there is
        # anything in the pulp2content to be saved
        save_batch = pulp2content and (len(pulp2content) >= batch_size or i == total_content - 1)
        if save_batch:
            _logger.debug(
                'Bulk save for generic content info, saved so far: {index}'.format(index=i + 1)
            )
            pulp2content_batch = Pulp2Content.objects.bulk_create(pulp2content,
                                                                  ignore_conflicts=True)

            # bulk_create(ignore_conflicts=True) hands back the same item-set we passed in,
            # *even if* it decided to update an existing db-record rather than creating a new
            # one with the passed-in PK. As a result, we can't trust pulp2content_batch to
            # have the 'right' PKs (i.e., the in-memory p2content_batch doesn't match the
            # db-reality). This causes the pre_migrate_content_detail() below to fail as it
            # attempts to create detail-records for the Pulp2Content records it's been handed.
            # THEREFORE - we need to find the 'real' IDs of everything in p2content-batch based
            # on its uniqueness-fields and update the in-memory list with them.
            for p2c in pulp2content_batch:
                filter_q = Q(
                    pulp2_content_type_id=content_type,
                    pulp2_id=p2c.pulp2_id,
                    pulp2_repo=p2c.pulp2_repo,
                    pulp2_subid=p2c.pulp2_subid,
                )
                p2c_db = Pulp2Content.objects.get(filter_q)
                p2c.pulp_id = p2c_db.pulp_id

            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            content_model.pulp_2to3_detail.pre_migrate_content_detail(pulp2content_batch)

            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            # start a fresh batch (the list object is reused)
            pulp2content.clear()
            existing_count = 0

    # If it's a per-repo content type and it's a migration re-run, we need to make sure that the
    # existing content hasn't been associated with a new repo since our last migration,
    # and if so, we need to go back and create a Pulp2Content for these new relations.
    # E.g. errata copied from one repo to another in Pulp 2, in such cases _last_updated is
    # unchanged.
    if set_pulp2_repo and last_updated:
        # last_updated is a unix timestamp, we need to convert it to use in our Django query.
        last_updated = datetime.utcfromtimestamp(last_updated)

        # Query all new relations for that content since the last run
        content_relations = Pulp2RepoContent.objects.filter(
            pulp2_content_type_id=content_type,
            pulp2_repository__not_in_plan=False,
            pulp2_created__gte=last_updated
        ).select_related(
            'pulp2_repository'
        ).only(
            'pulp2_repository', 'pulp2_created',
        ).order_by('pulp2_created')

        # fetch the Pulp 2 units behind those relations once and index them by id
        mongo_content_qs = content_model.pulp2.objects(
            id__in=content_relations.values_list('pulp2_unit_id', flat=True))
        pulp2_content_by_id = {
            record.id: record for record in mongo_content_qs.only(*mongo_fields).no_cache()
        }

        for relation in content_relations:
            record = pulp2_content_by_id[relation.pulp2_unit_id]
            downloaded = record.downloaded if hasattr(record, 'downloaded') else False
            specific_content_q = Q(
                pulp2_content_type_id=record._content_type_id,
                pulp2_id=record.id,
                pulp2_repo=relation.pulp2_repository,
                pulp2_subid='',
            )

            # Ensure that no existing pulp2content slipped into bulk_create.
            # Otherwise, we'll have a problem with later bulk_create for detail models.
            if Pulp2Content.objects.filter(specific_content_q).exists():
                continue

            item = Pulp2Content(
                pulp2_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                # Set `pulp2_last_updated` to the date of when a content unit got copied.
                # (We can't set it to anything higher, in case pre-migration crashes and we would
                # need to pick it up correctly on the next re-run.)
                # When erratum is copied in pulp 2, it doesn't change its _last_updated timestamp.
                # It means that Katello has no way to identify that the erratum has been copied
                # since the last migration run, without reimporting all errata, which is expensive.
                pulp2_last_updated=int(relation.pulp2_created.timestamp()),
                pulp2_storage_path=record._storage_path,
                downloaded=downloaded,
                pulp2_repo=relation.pulp2_repository
            )
            _logger.debug(
                'Add content item to the list to migrate: {item}'.format(item=item))
            pulp2content.append(item)
            pulp2content_pb.total += 1
            pulp2detail_pb.total += 1

        # unlike the batches above, this save does not pass ignore_conflicts
        pulp2content_batch = Pulp2Content.objects.bulk_create(pulp2content)
        pulp2content_pb.done += len(pulp2content_batch)
        pulp2content_pb.save()

        content_model.pulp_2to3_detail.pre_migrate_content_detail(pulp2content_batch)

        pulp2detail_pb.done += len(pulp2content_batch)
        pulp2detail_pb.save()

    # persist `total` adjustments made in the loops above that were not followed by a save
    pulp2content_pb.save()
    pulp2detail_pb.save()

    if pulp2mutatedcontent:
        # when we flip the is_migrated flag to False, we base this decision on the last_unit_added
        # https://github.com/pulp/pulp-2to3-migration/blob/master/pulp_2to3_migration/app/pre_migration.py#L279  # noqa
        # in this case, we still need to update the is_migrated flag manually because of errata.
        # in pulp2 sync and copy cases of updated errata are not covered
        # only when uploading errata last_unit_added is updated on all the repos that contain it
        mutated_content = Pulp2RepoContent.objects.filter(pulp2_unit_id__in=pulp2mutatedcontent)
        repo_to_update_ids = mutated_content.values_list(
            'pulp2_repository_id', flat=True).distinct()
        Pulp2Repository.objects.filter(pk__in=repo_to_update_ids).update(is_migrated=False)

    if lazy_type:
        pre_migrate_lazycatalog(content_type)

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
def pre_migrate_content_type(content_model, mutable_type, lazy_type, premigrate_hook):
    """
    Pre-migrate Pulp 2 content of one type, including all details for on_demand content.

    Content units whose ``_last_updated`` is at or after the newest timestamp already stored in
    ``Pulp2Content`` for this type are read from MongoDB, saved as ``Pulp2Content`` records in
    batches, and passed to the content model's ``pre_migrate_content_detail``. Two progress
    reports (general info and detail info) are updated along the way.

    Args:
        content_model: Models for content which is being migrated.
        mutable_type: Boolean that indicates whether the content type is mutable. For mutable
            types, existing ``Pulp2Content`` records of the incoming units are deleted and
            re-created, and repositories containing those units get ``is_migrated=False``.
        lazy_type: When truthy, ``pre_migrate_lazycatalog`` is called for this content type.
        premigrate_hook: Optional callable taking no arguments. When given, its return value is
            used as an ``id__in`` filter on the Pulp 2 content query.
    """
    batch_size = 100
    pulp2content = []
    pulp2mutatedcontent = []
    content_type = content_model.pulp2.TYPE_ID
    set_pulp2_repo = content_model.pulp_2to3_detail.set_pulp2_repo

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug(
        'The latest migrated {type} content has {timestamp} timestamp.'.format(
            type=content_type, timestamp=last_updated))

    query_args = {}
    if premigrate_hook:
        query_args["id__in"] = premigrate_hook()

    mongo_content_qs = content_model.pulp2.objects(
        _last_updated__gte=last_updated, **query_args).order_by("_last_updated")

    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type, total=total_content))

    pulp2content_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(content_type.upper()),
        code='premigrating.content.general',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressReport(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(content_type.upper()),
        code='premigrating.content.detail',
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    existing_count = 0

    if mutable_type:
        incoming_ids = []
        for c in mongo_content_qs.only('id', '_last_updated').no_cache().as_pymongo():
            if c['_last_updated'] == last_updated:
                # Same timestamp as the newest pre-migrated record: skip units that are already
                # pre-migrated with exactly this timestamp, they are not outdated.
                if Pulp2Content.objects.filter(pulp2_last_updated=last_updated,
                                               pulp2_id=c['_id']).exists():
                    continue
            incoming_ids.append(c['_id'])

        # This is a mutable content type. Query for the existing pulp2content.
        # If any was found, it means that the migrated content is older than the incoming.
        # Delete outdated migrated pulp2content and create a new pulp2content
        outdated = Pulp2Content.objects.filter(pulp2_id__in=incoming_ids)
        if outdated.exists():
            pulp2mutatedcontent.extend(incoming_ids)
        outdated.delete()

    mongo_fields = {'id', '_storage_path', '_last_updated', '_content_type_id'}
    if hasattr(content_model.pulp2, 'downloaded'):
        mongo_fields.add('downloaded')

    batched_mongo_content_qs = mongo_content_qs.only(*mongo_fields).batch_size(batch_size)

    # Number of Pulp 2 records iterated so far; only used for the debug log on batch save.
    processed = 0
    for processed, record in enumerate(batched_mongo_content_qs.no_cache(), start=1):
        if record._last_updated == last_updated:
            # corner case - content with the last ``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(
                pulp2_last_updated=last_updated, pulp2_id=record.id)
            if migrated.exists():
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2detail_pb.total -= 1
                continue

        item_kwargs = {
            'pulp2_id': record.id,
            'pulp2_content_type_id': record._content_type_id,
            'pulp2_last_updated': record._last_updated,
            'pulp2_storage_path': record._storage_path,
            'downloaded': getattr(record, 'downloaded', False),
        }

        if set_pulp2_repo:
            # This content requires to set pulp 2 repo. E.g. for errata, because 1 pulp2
            # content unit is converted into N pulp3 content units and repo_id is the only
            # way to have unique records for those.
            content_relations = Pulp2RepoContent.objects.filter(
                pulp2_unit_id=record.id,
                pulp2_content_type_id=record._content_type_id,
                pulp2_repository__not_in_plan=False,
            ).select_related('pulp2_repository').only('pulp2_repository')
            for relation in content_relations.iterator():
                item = Pulp2Content(pulp2_repo=relation.pulp2_repository, **item_kwargs)
                _logger.debug(
                    'Add content item to the list to migrate: {item}'.format(item=item))
                pulp2content.append(item)
                pulp2content_pb.total += 1
                pulp2detail_pb.total += 1

            # total needs to be adjusted, proper counting happened in the loop above,
            # so we subtract one because this content is also a part of initial 'total' counter.
            pulp2content_pb.total -= 1
            pulp2detail_pb.total -= 1
        else:
            item = Pulp2Content(**item_kwargs)
            _logger.debug(
                'Add content item to the list to migrate: {item}'.format(item=item))
            pulp2content.append(item)

        # Save as soon as a full batch has been collected.
        if len(pulp2content) >= batch_size:
            _save_pulp2content_batch(
                content_model, pulp2content, existing_count, processed,
                pulp2content_pb, pulp2detail_pb)
            pulp2content.clear()
            existing_count = 0

    # Save whatever is left. This is done after the loop rather than by comparing the loop index
    # with the initial count, so the remainder is not lost when the final record is skipped as
    # already pre-migrated or when the number of iterated records differs from the initial count.
    if pulp2content:
        _save_pulp2content_batch(
            content_model, pulp2content, existing_count, processed,
            pulp2content_pb, pulp2detail_pb)
        pulp2content.clear()
        existing_count = 0

    # Persist the adjusted totals even if no batch was saved at all.
    pulp2content_pb.save()
    pulp2detail_pb.save()

    if pulp2mutatedcontent:
        # when we flip the is_migrated flag to False, we base this decision on the last_unit_added
        # https://github.com/pulp/pulp-2to3-migration/blob/master/pulp_2to3_migration/app/pre_migration.py#L279  # noqa
        # in this case, we still need to update the is_migrated flag manually because of errata.
        # in pulp2 sync and copy cases of updated errata are not covered
        # only when uploading errata last_unit_added is updated on all the repos that contain it
        mutated_content = Pulp2RepoContent.objects.filter(pulp2_unit_id__in=pulp2mutatedcontent)
        repo_to_update_ids = mutated_content.values_list(
            'pulp2_repository_id', flat=True).distinct()
        Pulp2Repository.objects.filter(pk__in=repo_to_update_ids).update(is_migrated=False)

    if lazy_type:
        pre_migrate_lazycatalog(content_type)

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()


def _save_pulp2content_batch(content_model, pulp2content, existing_count, processed,
                             pulp2content_pb, pulp2detail_pb):
    """
    Bulk-save pending ``Pulp2Content`` items, pre-migrate their details and update progress.

    The caller is responsible for clearing ``pulp2content`` and resetting its skipped-record
    counter afterwards.

    Args:
        content_model: Models for content which is being migrated.
        pulp2content: Non-empty list of unsaved ``Pulp2Content`` items.
        existing_count: Number of records skipped as already pre-migrated since the last save.
        processed: Number of Pulp 2 records iterated so far (used for logging only).
        pulp2content_pb: Progress report for the general content info.
        pulp2detail_pb: Progress report for the detail content info.
    """
    _logger.debug(
        'Bulk save for generic content info, saved so far: {index}'.format(index=processed))
    pulp2content_batch = Pulp2Content.objects.bulk_create(pulp2content, ignore_conflicts=True)

    # NOTE(review): the skipped records counted by ``existing_count`` are never appended to
    # ``pulp2content``, so subtracting them here may under-report progress - confirm intent.
    content_saved = len(pulp2content_batch) - existing_count
    pulp2content_pb.done += content_saved
    pulp2content_pb.save()

    content_model.pulp_2to3_detail.pre_migrate_content_detail(pulp2content_batch)

    pulp2detail_pb.done += content_saved
    pulp2detail_pb.save()