async def _repair_artifacts_for_content(subset=None, verify_checksums=True):
    """
    Find content artifacts whose files are missing or corrupted and schedule their repair.

    Every ContentArtifact that has an Artifact is checked: first that the artifact's file
    exists in ``default_storage`` and then, if requested, that ``_verify_artifact`` accepts
    it. Each ContentArtifact failing a check is handed to ``_repair_ca`` together with the
    "repaired" progress report. Separate ProgressReports count missing and corrupted units.

    Args:
        subset: Optional collection of Content used to restrict the check (applied as a
            ``content__in`` filter). A falsy value checks every ContentArtifact that has
            an artifact.
        verify_checksums (bool): When True, files that exist are additionally passed to
            ``_verify_artifact``.
    """
    # Limit the number of concurrent repair tasks
    max_pending_repairs = 5

    loop = asyncio.get_event_loop()
    pending = set()

    query_set = models.ContentArtifact.objects.exclude(artifact__isnull=True)
    if subset:
        query_set = query_set.filter(content__in=subset)

    with ProgressReport(
        message="Identify missing units", code="repair.missing"
    ) as missing, ProgressReport(
        message="Identify corrupted units", code="repair.corrupted"
    ) as corrupted, ProgressReport(
        message="Repair corrupted units", code="repair.repaired"
    ) as repaired:
        with ThreadPoolExecutor(max_workers=2) as checksum_executor:
            for content_artifact in query_set.select_related("artifact").iterator():
                artifact = content_artifact.artifact

                valid = await loop.run_in_executor(
                    None, default_storage.exists, artifact.file.name
                )
                if not valid:
                    missing.increment()
                    log.warning(_("Missing file for {}").format(artifact))
                elif verify_checksums:
                    # default ThreadPoolExecutor uses num cores x 5 threads. Since we're doing
                    # such long and sequential reads, using too many threads might hurt more
                    # than help (on HDDs, maybe not on SSDs) by making the disk access pattern
                    # more random. Put it in a separate executor with limited threads.
                    # Should stay in (an) executor so that at least it doesn't completely block
                    # downloads.
                    valid = await loop.run_in_executor(
                        checksum_executor, _verify_artifact, artifact
                    )
                    if not valid:
                        corrupted.increment()
                        log.warning(_("Digest mismatch for {}").format(artifact))

                if not valid:
                    if len(pending) >= max_pending_repairs:
                        # Wait for at least one running repair to finish before adding more.
                        done, pending = await asyncio.wait(
                            pending, return_when=asyncio.FIRST_COMPLETED
                        )
                        await asyncio.gather(*done)  # Clean up tasks
                    pending.add(asyncio.ensure_future(_repair_ca(content_artifact, repaired)))

            # Let the remaining repairs finish while the progress reports are still open.
            await asyncio.gather(*pending)
def validate_toc(toc_filename):
    """
    Check validity of table-of-contents file.

    table-of-contents must:
      * exist
      * be valid JSON
      * have non-empty 'files' and 'meta' entries
      * point to chunked-export-files that exist 'next to' the 'toc' file
      * point to chunks whose checksums match the checksums stored in the 'toc' file

    Args:
        toc_filename (str): The user-provided toc-file-path to be validated.

    Returns:
        The parsed table-of-contents (the object produced by ``json.load``).

    Raises:
        ValidationError: When 'files' or 'meta' is missing/empty, when toc points to
            chunked-export-files that can't be found in the same directory as the toc-file,
            or when the checksums of the chunks do not match the checksums stored in toc.

    Note:
        A missing toc file or invalid JSON is not converted to ValidationError here; the
        exception raised by ``open``/``json.load`` propagates unchanged.
    """
    with open(toc_filename) as json_file:
        # Valid JSON?
        the_toc = json.load(json_file)

    if not the_toc.get("files", None) or not the_toc.get("meta", None):
        raise ValidationError(_("Missing 'files' or 'meta' keys in table-of-contents!"))

    base_dir = os.path.dirname(toc_filename)
    chunks = sorted(the_toc["files"])

    # Points at chunks that exist?
    missing_files = [
        chunk for chunk in chunks if not os.path.isfile(os.path.join(base_dir, chunk))
    ]
    if missing_files:
        # Format AFTER translating, so the catalog lookup uses the message template.
        raise ValidationError(
            _("Missing import-chunks named in table-of-contents: {}.").format(
                str(missing_files)
            )
        )

    # validate the sha256 of the toc-entries
    # gather errors for reporting at the end
    errs = []
    data = dict(message="Validating Chunks", code="validate.chunks", total=len(chunks))
    with ProgressReport(**data) as pb:
        for chunk in pb.iter(chunks):
            expected_hash = the_toc["files"][chunk]
            a_hash = _compute_hash(os.path.join(base_dir, chunk))
            if a_hash != expected_hash:
                errs.append(
                    "File {} expected checksum : {}, computed checksum : {}".format(
                        chunk, expected_hash, a_hash
                    )
                )

    # if there are any errors, report and fail
    if errs:
        raise ValidationError(_("Import chunk hash mismatch: {}).").format(str(errs)))

    return the_toc
def validate_and_assemble(toc_filename):
    """
    Validate checksums of, and reassemble, chunks in table-of-contents file.

    Args:
        toc_filename (str): Path to the table-of-contents file; chunks are looked up in,
            and the combined file is written to, the same directory.

    Returns:
        str: Path of the file to import - the single chunk when the toc lists exactly one
        file, otherwise the recombined file named by ``meta.file``.

    Raises:
        ValidationError: When ``validate_toc`` rejects the toc, when ``dd`` fails to append
            a chunk, or when the checksum of the recombined file does not match
            ``meta.global_hash``.
    """
    the_toc = validate_toc(toc_filename)
    toc_dir = os.path.dirname(toc_filename)
    result_file = os.path.join(toc_dir, the_toc["meta"]["file"])

    # if we have only one entry in "files", it must be the full .tar.gz - return it
    if len(the_toc["files"]) == 1:
        return os.path.join(toc_dir, next(iter(the_toc["files"])))

    # We have multiple chunks.
    # reassemble into one file 'next to' the toc and return the resulting full-path
    chunk_size = int(the_toc["meta"]["chunk_size"])
    offset = 0
    block_size = 1024
    # Integer division keeps the block count exact (no float round-trip).
    blocks_per_chunk = chunk_size // block_size

    # sorting-by-filename is REALLY IMPORTANT here
    # keys are of the form <base-export-name>.00..<base-export-name>.NN,
    # and must be reassembled IN ORDER
    the_chunk_files = sorted(the_toc["files"])

    data = dict(
        message="Recombining Chunks", code="recombine.chunks", total=len(the_chunk_files)
    )
    with ProgressReport(**data) as pb:
        for chunk in pb.iter(the_chunk_files):
            chunk_path = os.path.join(toc_dir, chunk)
            # For each chunk, add it to the reconstituted tar.gz, picking up where the previous
            # chunk left off
            completed = subprocess.run(
                [
                    "dd",
                    "if={}".format(chunk_path),
                    "of={}".format(result_file),
                    "bs={}".format(str(block_size)),
                    "seek={}".format(str(offset)),
                ],
            )
            # Stop before deleting the chunk if it was not copied successfully.
            if completed.returncode != 0:
                raise ValidationError(
                    _("Failed to append chunk {} to {} (dd exited with status {}).").format(
                        chunk_path, result_file, completed.returncode
                    )
                )
            offset += blocks_per_chunk

            # To keep from taking up All The Disk, we delete each chunk after it has been added
            # to the recombined file. Failing to delete is not fatal.
            try:
                os.remove(chunk_path)
            except OSError:
                log.warning(
                    _("Failed to remove chunk {} after recombining. Continuing.").format(
                        chunk_path
                    ),
                    exc_info=True,
                )

    combined_hash = _compute_hash(result_file)
    if combined_hash != the_toc["meta"]["global_hash"]:
        raise ValidationError(
            _("Mismatch between combined .tar.gz checksum [{}] and originating [{}]).").format(
                combined_hash, the_toc["meta"]["global_hash"]
            )
        )

    # if we get this far, then: the chunk-files all existed, they all pass checksum validation,
    # and there exists a combined .tar.gz, which *also* passes checksum-validation.
    # Let the rest of the import process do its thing on the new combined-file.
    return result_file
def purge(finished_before, states):
    """
    This task purges from the database records of tasks which finished prior to the specified time.

    It will remove only tasks that are 'owned' by the current-user (admin-users own All The Things,
    so admins can delete all tasks).

    It will not remove tasks that are incomplete (ie, in states running|waiting|cancelling).

    It reports (using ProgressReport) the total entities deleted, as well as individual counts
    for each class of entity. This shows the results of cascading-deletes that are triggered
    by deleting a Task.

    Args:
        finished_before (DateTime): Earliest finished-time to **NOT** purge.
        states (List[str]): List of task-states we want to purge.
    """
    current_user = get_current_authenticated_user()
    qs = Task.objects.filter(finished_at__lt=finished_before, state__in=states)
    # Only delete what the current user is permitted to delete.
    deletable = get_objects_for_user(current_user, "core.delete_task", qs=qs)
    units_deleted, details = deletable.delete()

    # Progress bar reporting total-units
    progress_bar = ProgressReport(
        message=_("Purged task-objects total"),
        total=units_deleted,
        code="purge.tasks.total",
        done=units_deleted,
        state="completed",
    )
    progress_bar.save()

    # This loop reports back the specific entities deleted and the number removed
    for key, count in details.items():
        progress_bar = ProgressReport(
            # Format AFTER translating, so the catalog lookup uses the message template.
            message=_("Purged task-objects of type {}").format(key),
            total=count,
            code="purge.tasks.key.{}".format(key),
            done=count,
            state="completed",
        )
        progress_bar.save()
def import_repository_version(importer_pk, destination_repo_pk, source_repo_name, tar_path):
    """
    Import a repository version from a Pulp export.

    Content files are imported with one retry: the first attempt only records errors, the
    second attempt is allowed to raise.

    Args:
        importer_pk (str): Importer we are working with
        destination_repo_pk (str): Primary key of Repository to import into.
        source_repo_name (str): Name of the Repository in the export.
        tar_path (str): A path to export tar.

    Raises:
        ValidationError: When the source and destination repository types differ, or when
            the export holds no repository-version directory for ``source_repo_name``.
    """
    dest_repo = Repository.objects.get(pk=destination_repo_pk)
    importer = PulpImporter.objects.get(pk=importer_pk)

    pb = ProgressReport(
        message=f"Importing content for {dest_repo.name}",
        code="import.repo.version.content",
        state=TASK_STATES.RUNNING,
    )
    pb.save()

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the repo file for the repo info
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extract(REPO_FILE, path=temp_dir)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

        src_repo = next(repo for repo in data if repo["name"] == source_repo_name)

        if dest_repo.pulp_type != src_repo["pulp_type"]:
            raise ValidationError(
                _(
                    "Repository type mismatch: {src_repo} ({src_type}) vs {dest_repo} "
                    "({dest_type})."
                ).format(
                    src_repo=src_repo["name"],
                    src_type=src_repo["pulp_type"],
                    dest_repo=dest_repo.name,
                    dest_type=dest_repo.pulp_type,
                )
            )

        # The repository name is data, not a pattern: escape it so regex metacharacters in
        # a name are matched literally.
        rv_pattern = re.compile(fr"(^repository-{re.escape(source_repo_name)}_[0-9]+)/.+")

        rv_name = ""
        # Extract the repo version files
        with tarfile.open(tar_path, "r:gz") as tar:
            for mem in tar.getmembers():
                match = rv_pattern.search(mem.name)
                if match:
                    rv_name = match.group(1)
                    tar.extract(mem, path=temp_dir)

        if not rv_name:
            # rv_name is empty here, so report the repository we were looking for.
            raise ValidationError(
                _("No RepositoryVersion found for {}").format(source_repo_name)
            )

        rv_path = os.path.join(temp_dir, rv_name)
        # Content
        plugin_name = src_repo["pulp_type"].split(".")[0]
        cfg = get_plugin_config(plugin_name)

        resulting_content_ids = []
        for res_class in cfg.exportable_classes:
            filename = f"{res_class.__module__}.{res_class.__name__}.json"
            file_path = os.path.join(rv_path, filename)
            a_result = _import_file(file_path, res_class, do_raise=False)
            # django import-export can have a problem with concurrent-imports that are
            # importing the same 'thing' (e.g., a Package that exists in two different
            # repo-versions that are being imported at the same time). We will try an import
            # that will simply record errors as they happen (rather than failing with an
            # exception) first. If errors happen, we'll do one retry before we give up on this
            # repo-version's import.
            if a_result.has_errors():
                log.info(
                    _("...{} import-errors encountered importing {} from {}, retrying").format(
                        a_result.totals["error"], filename, rv_name
                    )
                )
                # Second attempt, we allow to raise an exception on any problem.
                # This will either succeed, or log a fatal error and fail.
                try:
                    a_result = _import_file(file_path, res_class)
                except Exception:  # noqa log on ANY exception and then re-raise
                    log.error(
                        _("FATAL import-failure importing {} from {}").format(filename, rv_name)
                    )
                    raise

            resulting_content_ids.extend(
                row.object_id for row in a_result.rows if row.import_type in ("new", "update")
            )

        # Once all content exists, create the ContentArtifact links
        ca_path = os.path.join(rv_path, CA_FILE)
        _import_file(ca_path, ContentArtifactResource)

        # see if we have a content mapping
        mapping_path = f"{rv_name}/{CONTENT_MAPPING_FILE}"
        mapping = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            if mapping_path in tar.getnames():
                tar.extract(mapping_path, path=temp_dir)
                with open(os.path.join(temp_dir, mapping_path), "r") as mapping_file:
                    mapping = json.load(mapping_file)

        if mapping:
            # use the content mapping to map content to repos
            for repo_name, content_ids in mapping.items():
                repo = _destination_repo(importer, repo_name)
                content = Content.objects.filter(upstream_id__in=content_ids)
                with repo.new_version() as new_version:
                    new_version.set_content(content)
        else:
            # just map all the content to our destination repo
            content = Content.objects.filter(pk__in=resulting_content_ids)
            with dest_repo.new_version() as new_version:
                new_version.set_content(content)

        # With a mapping, `content` is the queryset of the last mapped repository.
        content_count = content.count()
        pb.total = content_count
        pb.done = content_count
        pb.state = TASK_STATES.COMPLETED
        pb.save()

    # Record one more finished repository version on the group-wide report.
    gpr = TaskGroup.current().group_progress_reports.filter(code="import.repo.versions")
    gpr.update(done=F("done") + 1)
def pulp_import(importer_pk, path, toc):
    """
    Import a Pulp export into Pulp.

    Artifacts are imported directly by this task; one ``import_repository_version`` task is
    dispatched for every exported repository that has a destination repository.

    Args:
        importer_pk (str): Primary key of PulpImporter to do the import
        path (str): Path to the export to be imported
        toc (str): Path to a table-of-contents file. When truthy, the toc is validated, its
            chunks are reassembled, and the resulting file is imported instead of ``path``.
    """

    def _compute_hash(filename):
        """Return the hex sha256 digest of the file at ``filename``."""
        sha256_hash = hashlib.sha256()
        with open(filename, "rb") as f:
            # Read and update hash string value in blocks of 4K
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
            return sha256_hash.hexdigest()

    def validate_toc(toc_filename):
        """
        Check validity of table-of-contents file.

        table-of-contents must:
          * exist
          * be valid JSON
          * have non-empty 'files' and 'meta' entries
          * point to chunked-export-files that exist 'next to' the 'toc' file
          * point to chunks whose checksums match the checksums stored in the 'toc' file

        Args:
            toc_filename (str): The user-provided toc-file-path to be validated.

        Returns:
            The parsed table-of-contents.

        Raises:
            ValidationError: When 'files' or 'meta' is missing/empty, when toc points to
                chunked-export-files that can't be found in the same directory as the
                toc-file, or when the checksums of the chunks do not match the checksums
                stored in toc. A missing toc file or invalid JSON propagates the exception
                raised by ``open``/``json.load`` unchanged.
        """
        with open(toc_filename) as json_file:
            # Valid JSON?
            the_toc = json.load(json_file)

        if not the_toc.get("files", None) or not the_toc.get("meta", None):
            raise ValidationError(_("Missing 'files' or 'meta' keys in table-of-contents!"))

        base_dir = os.path.dirname(toc_filename)
        chunks = sorted(the_toc["files"])

        # Points at chunks that exist?
        missing_files = [
            chunk for chunk in chunks if not os.path.isfile(os.path.join(base_dir, chunk))
        ]
        if missing_files:
            # Format AFTER translating, so the catalog lookup uses the message template.
            raise ValidationError(
                _("Missing import-chunks named in table-of-contents: {}.").format(
                    str(missing_files)
                )
            )

        # validate the sha256 of the toc-entries
        # gather errors for reporting at the end
        errs = []
        data = dict(message="Validating Chunks", code="validate.chunks", total=len(chunks))
        with ProgressReport(**data) as pb:
            for chunk in pb.iter(chunks):
                expected_hash = the_toc["files"][chunk]
                a_hash = _compute_hash(os.path.join(base_dir, chunk))
                if a_hash != expected_hash:
                    errs.append(
                        "File {} expected checksum : {}, computed checksum : {}".format(
                            chunk, expected_hash, a_hash
                        )
                    )

        # if there are any errors, report and fail
        if errs:
            raise ValidationError(_("Import chunk hash mismatch: {}).").format(str(errs)))

        return the_toc

    def validate_and_assemble(toc_filename):
        """Validate checksums of, and reassemble, chunks in table-of-contents file."""
        the_toc = validate_toc(toc_filename)
        toc_dir = os.path.dirname(toc_filename)
        result_file = os.path.join(toc_dir, the_toc["meta"]["file"])

        # if we have only one entry in "files", it must be the full .tar.gz - return it
        if len(the_toc["files"]) == 1:
            return os.path.join(toc_dir, next(iter(the_toc["files"])))

        # We have multiple chunks.
        # reassemble into one file 'next to' the toc and return the resulting full-path
        chunk_size = int(the_toc["meta"]["chunk_size"])
        offset = 0
        block_size = 1024
        # Integer division keeps the block count exact (no float round-trip).
        blocks_per_chunk = chunk_size // block_size

        # sorting-by-filename is REALLY IMPORTANT here
        # keys are of the form <base-export-name>.00..<base-export-name>.NN,
        # and must be reassembled IN ORDER
        the_chunk_files = sorted(the_toc["files"])

        data = dict(
            message="Recombining Chunks", code="recombine.chunks", total=len(the_chunk_files)
        )
        with ProgressReport(**data) as pb:
            for chunk in pb.iter(the_chunk_files):
                chunk_path = os.path.join(toc_dir, chunk)
                # For each chunk, add it to the reconstituted tar.gz, picking up where the
                # previous chunk left off
                completed = subprocess.run(
                    [
                        "dd",
                        "if={}".format(chunk_path),
                        "of={}".format(result_file),
                        "bs={}".format(str(block_size)),
                        "seek={}".format(str(offset)),
                    ],
                )
                # Stop before deleting the chunk if it was not copied successfully.
                if completed.returncode != 0:
                    raise ValidationError(
                        _("Failed to append chunk {} to {} (dd exited with status {}).").format(
                            chunk_path, result_file, completed.returncode
                        )
                    )
                offset += blocks_per_chunk

                # To keep from taking up All The Disk, we delete each chunk after it has been
                # added to the recombined file. Failing to delete is not fatal.
                try:
                    os.remove(chunk_path)
                except OSError:
                    log.warning(
                        _("Failed to remove chunk {} after recombining. Continuing.").format(
                            chunk_path
                        ),
                        exc_info=True,
                    )

        combined_hash = _compute_hash(result_file)
        if combined_hash != the_toc["meta"]["global_hash"]:
            raise ValidationError(
                _(
                    "Mismatch between combined .tar.gz checksum [{}] and originating [{}])."
                ).format(combined_hash, the_toc["meta"]["global_hash"])
            )

        # if we get this far, then: the chunk-files all existed, they all pass checksum
        # validation, and there exists a combined .tar.gz, which *also* passes
        # checksum-validation.
        # Let the rest of the import process do its thing on the new combined-file.
        return result_file

    if toc:
        log.info(_("Validating TOC {}.").format(toc))
        path = validate_and_assemble(toc)

    log.info(_("Importing {}.").format(path))
    current_task = Task.current()
    importer = PulpImporter.objects.get(pk=importer_pk)
    the_import = PulpImport.objects.create(
        importer=importer, task=current_task, params={"path": path}
    )
    CreatedResource.objects.create(content_object=the_import)

    # Attach the current task to a new task group that the dispatched tasks also join.
    task_group = TaskGroup.objects.create(description=f"Import of {path}")
    Task.objects.filter(pk=current_task.pk).update(task_group=task_group)
    current_task.refresh_from_db()
    CreatedResource.objects.create(content_object=task_group)

    with tempfile.TemporaryDirectory() as temp_dir:
        # NOTE(review): extractall() trusts the member paths stored in the archive; confirm
        # that exports reaching this task come from a trusted source.
        with tarfile.open(path, "r:gz") as tar:
            tar.extractall(path=temp_dir)

        # Check version info
        with open(os.path.join(temp_dir, VERSIONS_FILE)) as version_file:
            version_json = json.load(version_file)
            _check_versions(version_json)

        # Artifacts
        ar_result = _import_file(os.path.join(temp_dir, ARTIFACT_FILE), ArtifactResource)
        data = dict(
            message="Importing Artifacts", code="import.artifacts", total=len(ar_result.rows)
        )
        with ProgressReport(**data) as pb:
            for row in pb.iter(ar_result.rows):
                artifact = Artifact.objects.get(pk=row.object_id)
                base_path = os.path.join("artifact", artifact.sha256[0:2], artifact.sha256[2:])
                src = os.path.join(temp_dir, base_path)
                dest = os.path.join(settings.MEDIA_ROOT, base_path)

                # Only copy files that storage does not already hold.
                if not default_storage.exists(dest):
                    with open(src, "rb") as f:
                        default_storage.save(dest, f)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)
            gpr = GroupProgressReport(
                message="Importing repository versions",
                code="import.repo.versions",
                total=len(data),
                done=0,
                task_group=task_group,
            )
            gpr.save()

            for src_repo in data:
                try:
                    dest_repo = _destination_repo(importer, src_repo["name"])
                except Repository.DoesNotExist:
                    log.warning(
                        _("Could not find destination repo for {}. Skipping.").format(
                            src_repo["name"]
                        )
                    )
                    continue

                dispatch(
                    import_repository_version,
                    [dest_repo],
                    args=[importer.pk, dest_repo.pk, src_repo["name"], path],
                    task_group=task_group,
                )

    task_group.finish()
def orphan_cleanup():
    """
    Remove every orphaned Content and Artifact record.

    Content counts as orphaned when it has no version memberships (PublishedMetadata is
    never touched); an Artifact counts as orphaned when it has no content memberships.
    Artifacts are deleted one by one so that their files are removed as well.
    """
    published_metadata_type = PublishedMetadata.get_pulp_type()
    orphan_content = Content.objects.filter(version_memberships__isnull=True).exclude(
        pulp_type=published_metadata_type
    )

    content_report = ProgressReport(
        message="Clean up orphan Content",
        total=orphan_content.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    content_report.save()

    # Remove the content batch by batch, counting each batch before it disappears.
    for batch in queryset_iterator(orphan_content):
        content_report.increase_by(batch.count())
        batch.delete()

    content_report.state = "completed"
    content_report.save()

    # Artifacts no content refers to any more.
    orphan_artifacts = Artifact.objects.filter(content_memberships__isnull=True)

    artifact_report = ProgressReport(
        message="Clean up orphan Artifacts",
        total=orphan_artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    artifact_report.save()

    save_every = 100
    for position, orphan in enumerate(orphan_artifacts.iterator(), start=1):
        # delete() is called per object because it also removes the file from the filesystem
        orphan.delete()
        artifact_report.done += 1
        # Persist progress only once per `save_every` deletions.
        if position % save_every == 0:
            artifact_report.save()

    artifact_report.state = "completed"
    artifact_report.save()
def import_repository_version(importer_pk, destination_repo_pk, source_repo_name, tar_path):
    """
    Import a repository version from a Pulp export.

    Args:
        importer_pk (str): Importer we are working with
        destination_repo_pk (str): Primary key of Repository to import into.
        source_repo_name (str): Name of the Repository in the export.
        tar_path (str): A path to export tar.

    Raises:
        ValidationError: When the source and destination repository types differ.
    """
    dest_repo = Repository.objects.get(pk=destination_repo_pk)
    importer = PulpImporter.objects.get(pk=importer_pk)

    pb = ProgressReport(
        message=f"Importing content for {dest_repo.name}",
        code="import.repo.version.content",
        state=TASK_STATES.RUNNING,
    )
    pb.save()

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the repo file for the repo info
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extract(REPO_FILE, path=temp_dir)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

        src_repo = next(repo for repo in data if repo["name"] == source_repo_name)
        # Directory of this repository version inside the archive; computed once and reused.
        rv_dir = _repo_version_path(src_repo)
        rv_path = os.path.join(temp_dir, rv_dir)

        if dest_repo.pulp_type != src_repo["pulp_type"]:
            raise ValidationError(
                _(
                    "Repository type mismatch: {src_repo} ({src_type}) vs {dest_repo} "
                    "({dest_type})."
                ).format(
                    src_repo=src_repo["name"],
                    src_type=src_repo["pulp_type"],
                    dest_repo=dest_repo.name,
                    dest_type=dest_repo.pulp_type,
                )
            )

        # The path is data, not a pattern: escape it so regex metacharacters in it are
        # matched literally.
        member_pattern = re.compile(fr"^{re.escape(rv_dir)}/.+")

        # Extract the repo version files
        with tarfile.open(tar_path, "r:gz") as tar:
            for mem in tar.getmembers():
                if member_pattern.match(mem.name):
                    tar.extract(mem, path=temp_dir)

        # Content
        plugin_name = src_repo["pulp_type"].split(".")[0]
        cfg = get_plugin_config(plugin_name)

        resulting_content_ids = []
        for res_class in cfg.exportable_classes:
            filename = f"{res_class.__module__}.{res_class.__name__}.json"
            a_result = _import_file(os.path.join(rv_path, filename), res_class)
            resulting_content_ids.extend(
                row.object_id for row in a_result.rows if row.import_type in ("new", "update")
            )

        # Once all content exists, create the ContentArtifact links
        ca_path = os.path.join(rv_path, CA_FILE)
        _import_file(ca_path, ContentArtifactResource)

        # see if we have a content mapping
        mapping_path = f"{rv_dir}/{CONTENT_MAPPING_FILE}"
        mapping = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            if mapping_path in tar.getnames():
                tar.extract(mapping_path, path=temp_dir)
                with open(os.path.join(temp_dir, mapping_path), "r") as mapping_file:
                    mapping = json.load(mapping_file)

        if mapping:
            # use the content mapping to map content to repos
            for repo_name, content_ids in mapping.items():
                repo = _destination_repo(importer, repo_name)
                content = Content.objects.filter(upstream_id__in=content_ids)
                with repo.new_version() as new_version:
                    new_version.set_content(content)
        else:
            # just map all the content to our destination repo
            content = Content.objects.filter(pk__in=resulting_content_ids)
            with dest_repo.new_version() as new_version:
                new_version.set_content(content)

        # With a mapping, `content` is the queryset of the last mapped repository.
        content_count = content.count()
        pb.total = content_count
        pb.done = content_count
        pb.state = TASK_STATES.COMPLETED
        pb.save()

    # Record one more finished repository version on the group-wide report.
    gpr = TaskGroup.current().group_progress_reports.filter(code="import.repo.versions")
    gpr.update(done=F("done") + 1)
def orphan_cleanup(content_pks=None):
    """
    Remove orphaned Content and Artifact records.

    The orphan-Content query is repeated until it comes back empty, so content that only
    becomes orphaned by an earlier pass (content from subrepos) is removed too. Artifacts
    are deleted one by one so that their files are removed from the filesystem as well.

    Kwargs:
        content_pks (list): A list of content pks. If specified, only remove these orphans.
    """
    content_report = ProgressReport(
        message="Clean up orphan Content",
        total=0,
        code="clean-up.content",
        done=0,
        state="running",
    )

    while True:
        orphans = Content.objects.filter(version_memberships__isnull=True)
        orphans = orphans.exclude(pulp_type=PublishedMetadata.get_pulp_type())
        if content_pks:
            orphans = orphans.filter(pk__in=content_pks)

        found = orphans.count()
        if not found:
            # Nothing (more) to remove.
            break

        # The total grows with every pass that still finds orphans.
        content_report.total += found
        content_report.save()

        for batch in queryset_iterator(orphans):
            content_report.increase_by(batch.count())
            batch.delete()

    content_report.state = "completed"
    content_report.save()

    # Artifacts no content refers to any more.
    orphan_artifacts = Artifact.objects.filter(content_memberships__isnull=True)

    artifact_report = ProgressReport(
        message="Clean up orphan Artifacts",
        total=orphan_artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    artifact_report.save()

    save_every = 100
    for position, orphan in enumerate(orphan_artifacts.iterator(), start=1):
        # delete() is called per object because it also removes the file from the filesystem
        orphan.delete()
        artifact_report.done += 1
        # Persist progress only once per `save_every` deletions.
        if position % save_every == 0:
            artifact_report.save()

    artifact_report.state = "completed"
    artifact_report.save()
def orphan_cleanup():
    """
    Remove every orphaned Content and Artifact record.

    Content is orphaned when no RepositoryContent row refers to it (PublishedMetadata is
    never touched); an Artifact is orphaned when no ContentArtifact row refers to it.
    Artifacts are deleted one by one so that their files are removed as well.
    """
    # Content cleanup
    referenced_content_ids = RepositoryContent.objects.values_list("content_id", flat=True)
    orphan_content = Content.objects.exclude(pk__in=referenced_content_ids)
    orphan_content = orphan_content.exclude(pulp_type=f"core.{PublishedMetadata.TYPE}")

    content_report = ProgressReport(
        message="Clean up orphan Content",
        total=orphan_content.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    content_report.save()

    # One bulk delete, after which the report is marked fully done.
    orphan_content.delete()
    content_report.done = content_report.total
    content_report.state = "completed"
    content_report.save()

    # Artifact cleanup
    referenced_artifact_ids = ContentArtifact.objects.values_list("artifact_id", flat=True)
    orphan_artifacts = Artifact.objects.exclude(pk__in=referenced_artifact_ids)

    artifact_report = ProgressReport(
        message="Clean up orphan Artifacts",
        total=orphan_artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    artifact_report.save()

    for orphan in orphan_artifacts:
        # delete() is called per object because it also removes the file from the filesystem
        orphan.delete()
        artifact_report.increment()

    artifact_report.state = "completed"
    artifact_report.save()
def reclaim_space(repo_pks, keeplist_rv_pks=None, force=False):
    """
    This task frees-up disk space by removing Artifact files from the filesystem for Content
    exclusive to the list of provided repos.

    Note: content marked as `protected` will be excluded from the reclaim disk space.

    Args:
        repo_pks (list): A list of repo pks the disk reclaim space is performed on.

    Kwargs:
        keeplist_rv_pks (list): A list of repo version pks that will be excluded from the reclaim
            disk space.
        force (bool): If True, uploaded content will be taken into account.
    """
    reclaimed_repos = Repository.objects.filter(pk__in=repo_pks)
    for repo in reclaimed_repos:
        repo.invalidate_cache(everything=True)

    rest_of_repos = Repository.objects.exclude(pk__in=repo_pks)
    c_keep_qs = Content.objects.filter(repositories__in=rest_of_repos)
    c_reclaim_qs = Content.objects.filter(repositories__in=repo_pks)
    # Two separate exclude() calls: a single exclude() with both lookups would only drop
    # rows matching BOTH conditions, leaving content shared with other repos reclaimable.
    c_reclaim_qs = c_reclaim_qs.exclude(pk__in=c_keep_qs).exclude(
        pulp_type=PublishedMetadata.get_pulp_type()
    )

    if keeplist_rv_pks:
        # Content of every keep-listed repository version is left alone.
        rv_qs = RepositoryVersion.objects.filter(pk__in=keeplist_rv_pks)
        rv_content = Content.objects.none()
        for rv in rv_qs.iterator():
            rv_content |= rv.content
        c_reclaim_qs = c_reclaim_qs.exclude(pk__in=rv_content)

    # One representative per content type is enough to read the type's protection flag.
    content_distinct = c_reclaim_qs.distinct("pulp_type")
    unprotected = {
        content.pulp_type
        for content in content_distinct
        if not content.cast().PROTECTED_FROM_RECLAIM
    }

    ca_qs = ContentArtifact.objects.filter(
        content__in=c_reclaim_qs.values("pk"), artifact__isnull=False
    )
    if not force:
        ca_qs = ca_qs.filter(remoteartifact__isnull=False)

    artifact_pks = set()
    ca_to_update = []
    for ca in ca_qs.iterator():
        if ca.content.pulp_type in unprotected:
            artifact_pks.add(ca.artifact.pk)
            # Detach the artifact; the link is written back in bulk below.
            ca.artifact = None
            ca_to_update.append(ca)

    ContentArtifact.objects.bulk_update(objs=ca_to_update, fields=["artifact"], batch_size=1000)
    artifacts_to_delete = Artifact.objects.filter(pk__in=artifact_pks)

    progress_bar = ProgressReport(
        message="Reclaim disk space",
        total=artifacts_to_delete.count(),
        code="reclaim-space.artifact",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts_to_delete.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        # Persist progress only once per `interval` deletions.
        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()