Example 1
async def _repair_artifacts_for_content(subset=None, verify_checksums=True):
    """Identify missing or corrupted artifact files and schedule repairs for them."""
    loop = asyncio.get_running_loop()
    pending = set()

    query_set = models.ContentArtifact.objects.exclude(artifact__isnull=True)

    if subset:
        query_set = query_set.filter(content__in=subset)

    with ProgressReport(
        message="Identify missing units", code="repair.missing"
    ) as missing, ProgressReport(
        message="Identify corrupted units", code="repair.corrupted"
    ) as corrupted, ProgressReport(
        message="Repair corrupted units", code="repair.repaired"
    ) as repaired:

        with ThreadPoolExecutor(max_workers=2) as checksum_executor:
            for content_artifact in query_set.select_related("artifact").iterator():
                artifact = content_artifact.artifact

                valid = await loop.run_in_executor(None,
                                                   default_storage.exists,
                                                   artifact.file.name)
                if not valid:
                    missing.increment()
                    log.warn(_("Missing file for {}").format(artifact))
                elif verify_checksums:
                    # default ThreadPoolExecutor uses num cores x 5 threads. Since we're doing
                    # such long and sequential reads, using too many threads might hurt more
                    # than help (on HDDs, maybe not on SSDs) by making the disk access pattern
                    # more random. Put it in a separate executor with limited threads.
                    # Should stay in (an) executor so that at least it doesn't completely block
                    # downloads.
                    valid = await loop.run_in_executor(checksum_executor,
                                                       _verify_artifact,
                                                       artifact)
                    if not valid:
                        corrupted.increment()
                        log.warn(_("Digest mismatch for {}").format(artifact))

                if not valid:
                    if len(pending) >= 5:  # Limit the number of concurrent repair tasks
                        done, pending = await asyncio.wait(
                            pending, return_when=asyncio.FIRST_COMPLETED
                        )
                        await asyncio.gather(*done)  # Retrieve results so exceptions surface here
                    pending.add(asyncio.ensure_future(_repair_ca(content_artifact, repaired)))
        await asyncio.gather(*pending)
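
A note on the pattern above: the repair loop caps in-flight repair coroutines at five by waiting
for the first pending task to finish before scheduling another. A minimal, self-contained sketch
of that bounded-concurrency idea, using a stand-in do_work() coroutine instead of _repair_ca():

import asyncio
import random


async def do_work(item):
    # Stand-in for a repair coroutine such as _repair_ca().
    await asyncio.sleep(random.random() / 10)
    return item


async def process_all(items, limit=5):
    pending = set()
    results = []
    for item in items:
        if len(pending) >= limit:
            # Wait until at least one in-flight task finishes before scheduling more.
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            # gather() re-raises any exception raised by the finished tasks.
            results.extend(await asyncio.gather(*done))
        pending.add(asyncio.ensure_future(do_work(item)))
    # Drain whatever is still in flight.
    results.extend(await asyncio.gather(*pending))
    return results


asyncio.run(process_all(range(20)))
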
Example 2
    def validate_toc(toc_filename):
        """
        Check validity of table-of-contents file.

        table-of-contents must:
          * exist
          * be valid JSON
          * point to chunked-export-files that exist 'next to' the 'toc' file
          * point to chunks whose checksums match the checksums stored in the 'toc' file

        Args:
            toc_filename (str): The user-provided toc-file-path to be validated.

        Raises:
            ValidationError: If toc is not a valid JSON table-of-contents file,
            or when toc points to chunked-export-files that can't be found in the same
            directory as the toc-file, or the checksums of the chunks do not match the
            checksums stored in toc.
        """
        with open(toc_filename) as json_file:
            # Valid JSON?
            the_toc = json.load(json_file)
            if not the_toc.get("files", None) or not the_toc.get("meta", None):
                raise ValidationError(_("Missing 'files' or 'meta' keys in table-of-contents!"))

            base_dir = os.path.dirname(toc_filename)
            # Points at chunks that exist?
            missing_files = []
            for f in sorted(the_toc["files"].keys()):
                if not os.path.isfile(os.path.join(base_dir, f)):
                    missing_files.append(f)
            if missing_files:
                raise ValidationError(
                    _("Missing import-chunks named in table-of-contents: {}.").format(
                        missing_files
                    )
                )

            errs = []
            # validate the sha256 of the toc-entries
            # gather errors for reporting at the end
            chunks = sorted(the_toc["files"].keys())
            data = dict(message="Validating Chunks", code="validate.chunks", total=len(chunks))
            with ProgressReport(**data) as pb:
                for chunk in pb.iter(chunks):
                    a_hash = _compute_hash(os.path.join(base_dir, chunk))
                    if a_hash != the_toc["files"][chunk]:
                        err_str = "File {} expected checksum : {}, computed checksum : {}".format(
                            chunk, the_toc["files"][chunk], a_hash
                        )
                        errs.append(err_str)

            # if there are any errors, report and fail
            if errs:
                raise ValidationError(_("Import chunk hash mismatch: {}).").format(str(errs)))

        return the_toc
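
For orientation, here is roughly the structure validate_toc() expects once the JSON is parsed,
inferred from the fields read above and in validate_and_assemble(); every value is illustrative:

# Illustrative shape of a parsed table-of-contents; all values below are made up.
the_toc = {
    "meta": {
        "file": "export.tar.gz",        # name of the recombined archive
        "chunk_size": 1073741824,       # bytes per chunk, used when reassembling
        "global_hash": "<sha256 of the full recombined archive>",
    },
    "files": {
        # chunk filename (located next to the toc file) -> sha256 of that chunk
        "export.tar.gz.00": "<sha256 of chunk 00>",
        "export.tar.gz.01": "<sha256 of chunk 01>",
    },
}
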
Example 3
    def validate_and_assemble(toc_filename):
        """Validate checksums of, and reassemble, chunks in table-of-contents file."""
        the_toc = validate_toc(toc_filename)
        toc_dir = os.path.dirname(toc_filename)
        result_file = os.path.join(toc_dir, the_toc["meta"]["file"])

        # if we have only one entry in "files", it must be the full .tar.gz - return it
        if len(the_toc["files"]) == 1:
            return os.path.join(toc_dir, list(the_toc["files"].keys())[0])

        # We have multiple chunks.
        # reassemble into one file 'next to' the toc and return the resulting full-path
        chunk_size = int(the_toc["meta"]["chunk_size"])
        offset = 0
        block_size = 1024
        blocks_per_chunk = int(chunk_size / block_size)

        # sorting-by-filename is REALLY IMPORTANT here
        # keys are of the form <base-export-name>.00..<base-export-name>.NN,
        # and must be reassembled IN ORDER
        the_chunk_files = sorted(the_toc["files"].keys())

        data = dict(message="Recombining Chunks",
                    code="recombine.chunks",
                    total=len(the_chunk_files))
        with ProgressReport(**data) as pb:
            for chunk in pb.iter(the_chunk_files):
                # For each chunk, add it to the reconstituted tar.gz, picking up where the previous
                # chunk left off
                subprocess.run([
                    "dd",
                    "if={}".format(os.path.join(toc_dir, chunk)),
                    "of={}".format(result_file),
                    "bs={}".format(block_size),
                    "seek={}".format(offset),
                ])
                offset += blocks_per_chunk
                # To keep from taking up All The Disk, we delete each chunk after it has been added
                # to the recombined file.
                try:
                    os.remove(os.path.join(toc_dir, chunk))
                except OSError:
                    log.warning(
                        _("Failed to remove chunk {} after recombining. Continuing.").format(
                            os.path.join(toc_dir, chunk)
                        ),
                        exc_info=True,
                    )

        combined_hash = _compute_hash(result_file)
        if combined_hash != the_toc["meta"]["global_hash"]:
            raise ValidationError(
                _("Mismatch between combined .tar.gz checksum [{}] and originating [{}].").format(
                    combined_hash, the_toc["meta"]["global_hash"]))
        # if we get this far, then: the chunk-files all existed, they all pass checksum validation,
        # and there exists a combined .tar.gz, which *also* passes checksum-validation.
        # Let the rest of the import process do its thing on the new combined-file.
        return result_file
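
The dd call above writes each chunk at an offset of block_size * offset bytes into the result
file. Purely as an illustration (not what the code above does), the same reassembly can be
expressed with plain Python file I/O; recombine() below is a hypothetical helper:

import os


def recombine(toc_dir, chunk_names, result_file):
    # Chunks are appended in sorted order, so sequential writes replace the explicit dd seeks.
    with open(result_file, "wb") as out:
        for chunk in sorted(chunk_names):
            chunk_path = os.path.join(toc_dir, chunk)
            with open(chunk_path, "rb") as src:
                # Copy in 1 MiB blocks to keep memory use bounded.
                while True:
                    block = src.read(1024 * 1024)
                    if not block:
                        break
                    out.write(block)
            os.remove(chunk_path)  # free disk space as we go, like the original
    return result_file
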
Example 4
def purge(finished_before, states):
    """
    This task purges from the database records of tasks which finished prior to the specified time.

    It will remove only tasks that are 'owned' by the current-user (admin-users own All The Things,
    so admins can delete all tasks).

    It will not remove tasks that are incomplete (i.e., in states running|waiting|cancelling).

    It reports (using ProgressReport) the total entities deleted, as well as individual counts
    for each class of entity. This shows the results of cascading-deletes that are triggered
    by deleting a Task.

    Args:
        finished_before (DateTime): Earliest finished-time to **NOT** purge.
        states (List[str]): List of task-states we want to purge.

    """
    current_user = get_current_authenticated_user()
    qs = Task.objects.filter(finished_at__lt=finished_before, state__in=states)
    units_deleted, details = get_objects_for_user(current_user,
                                                  "core.delete_task",
                                                  qs=qs).delete()

    # Progress bar reporting total-units
    progress_bar = ProgressReport(
        message=_("Purged task-objects total"),
        total=units_deleted,
        code="purge.tasks.total",
        done=units_deleted,
        state="completed",
    )
    progress_bar.save()
    # This loop reports back the specific entities deleted and the number removed
    for key in details:
        progress_bar = ProgressReport(
            message=_("Purged task-objects of type {}".format(key)),
            total=details[key],
            code="purge.tasks.key.{}".format(key),
            done=details[key],
            state="completed",
        )
        progress_bar.save()
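
A hypothetical invocation, purging tasks that finished more than a week ago; the state names
passed here are assumptions and depend on the task model in use:

from datetime import datetime, timedelta, timezone

# Timezone-aware cutoff one week in the past; "completed" and "failed" are assumed state names.
cutoff = datetime.now(tz=timezone.utc) - timedelta(days=7)
purge(finished_before=cutoff, states=["completed", "failed"])
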
Example 5
def import_repository_version(importer_pk, destination_repo_pk,
                              source_repo_name, tar_path):
    """
    Import a repository version from a Pulp export.

    Args:
        importer_pk (str): Importer we are working with
        destination_repo_pk (str): Primary key of Repository to import into.
        source_repo_name (str): Name of the Repository in the export.
        tar_path (str): A path to export tar.
    """
    dest_repo = Repository.objects.get(pk=destination_repo_pk)
    importer = PulpImporter.objects.get(pk=importer_pk)

    pb = ProgressReport(
        message=f"Importing content for {dest_repo.name}",
        code="import.repo.version.content",
        state=TASK_STATES.RUNNING,
    )
    pb.save()

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the repo file for the repo info
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extract(REPO_FILE, path=temp_dir)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

        src_repo = next(repo for repo in data
                        if repo["name"] == source_repo_name)

        if dest_repo.pulp_type != src_repo["pulp_type"]:
            raise ValidationError(
                _("Repository type mismatch: {src_repo} ({src_type}) vs {dest_repo} "
                  "({dest_type}).").format(
                      src_repo=src_repo["name"],
                      src_type=src_repo["pulp_type"],
                      dest_repo=dest_repo.name,
                      dest_type=dest_repo.pulp_type,
                  ))

        rv_name = ""
        # Extract the repo version files
        with tarfile.open(tar_path, "r:gz") as tar:
            for mem in tar.getmembers():
                match = re.search(
                    fr"(^repository-{re.escape(source_repo_name)}_[0-9]+)/.+", mem.name)
                if match:
                    rv_name = match.group(1)
                    tar.extract(mem, path=temp_dir)

        if not rv_name:
            raise ValidationError(
                _("No RepositoryVersion found for {}").format(source_repo_name))

        rv_path = os.path.join(temp_dir, rv_name)
        # Content
        plugin_name = src_repo["pulp_type"].split(".")[0]
        cfg = get_plugin_config(plugin_name)

        resulting_content_ids = []
        for res_class in cfg.exportable_classes:
            filename = f"{res_class.__module__}.{res_class.__name__}.json"
            # django import-export can have a problem with concurrent-imports that are
            # importing the same 'thing' (e.g., a Package that exists in two different
            # repo-versions that are being imported at the same time). We will try an import
            # that will simply record errors as they happen (rather than failing with an exception)
            # first. If errors happen, we'll do one retry before we give up on this repo-version's
            # import.
            a_result = _import_file(os.path.join(rv_path, filename),
                                    res_class,
                                    do_raise=False)
            if a_result.has_errors():
                log.info(
                    _("...{} import-errors encountered importing {} from {}, retrying"
                      ).format(a_result.totals["error"], filename, rv_name))
                # Second attempt, we allow to raise an exception on any problem.
                # This will either succeed, or log a fatal error and fail.
                try:
                    a_result = _import_file(os.path.join(rv_path, filename),
                                            res_class)
                except Exception:  # noqa log on ANY exception and then re-raise
                    log.error(
                        _("FATAL import-failure importing {} from {}").format(
                            filename, rv_name))
                    raise

            resulting_content_ids.extend(
                row.object_id for row in a_result.rows if row.import_type in ("new", "update")
            )

        # Once all content exists, create the ContentArtifact links
        ca_path = os.path.join(rv_path, CA_FILE)
        _import_file(ca_path, ContentArtifactResource)

        # see if we have a content mapping
        mapping_path = f"{rv_name}/{CONTENT_MAPPING_FILE}"
        mapping = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            if mapping_path in tar.getnames():
                tar.extract(mapping_path, path=temp_dir)
                with open(os.path.join(temp_dir, mapping_path),
                          "r") as mapping_file:
                    mapping = json.load(mapping_file)

        if mapping:
            # use the content mapping to map content to repos
            for repo_name, content_ids in mapping.items():
                repo = _destination_repo(importer, repo_name)
                content = Content.objects.filter(upstream_id__in=content_ids)
                with repo.new_version() as new_version:
                    new_version.set_content(content)
        else:
            # just map all the content to our destination repo
            content = Content.objects.filter(pk__in=resulting_content_ids)
            with dest_repo.new_version() as new_version:
                new_version.set_content(content)

        content_count = content.count()
        pb.total = content_count
        pb.done = content_count
        pb.state = TASK_STATES.COMPLETED
        pb.save()

    gpr = TaskGroup.current().group_progress_reports.filter(
        code="import.repo.versions")
    gpr.update(done=F("done") + 1)
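
The member matching above expects repository-version directories named
repository-<source_repo_name>_<version>/ inside the tar. A quick illustration of that regular
expression against a made-up member name:

import re

source_repo_name = "myrepo"  # illustrative
member_name = "repository-myrepo_3/pulp_file.app.modelresource.FileContentResource.json"
match = re.search(fr"(^repository-{re.escape(source_repo_name)}_[0-9]+)/.+", member_name)
print(match.group(1))  # -> repository-myrepo_3
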
Example 6
def pulp_import(importer_pk, path, toc):
    """
    Import a Pulp export into Pulp.

    Args:
        importer_pk (str): Primary key of PulpImporter to do the import
        path (str): Path to the export to be imported
        toc (str): Path to a table-of-contents file describing a chunked export, if one was used
    """
    def _compute_hash(filename):
        sha256_hash = hashlib.sha256()
        with open(filename, "rb") as f:
            # Read and update hash string value in blocks of 4K
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
            return sha256_hash.hexdigest()

    def validate_toc(toc_filename):
        """
        Check validity of table-of-contents file.

        table-of-contents must:
          * exist
          * be valid JSON
          * point to chunked-export-files that exist 'next to' the 'toc' file
          * point to chunks whose checksums match the checksums stored in the 'toc' file

        Args:
            toc_filename (str): The user-provided toc-file-path to be validated.

        Raises:
            ValidationError: If toc is not a valid JSON table-of-contents file,
            or when toc points to chunked-export-files that can't be found in the same
            directory as the toc-file, or the checksums of the chunks do not match the
            checksums stored in toc.
        """
        with open(toc_filename) as json_file:
            # Valid JSON?
            the_toc = json.load(json_file)
            if not the_toc.get("files", None) or not the_toc.get("meta", None):
                raise ValidationError(
                    _("Missing 'files' or 'meta' keys in table-of-contents!"))

            base_dir = os.path.dirname(toc_filename)
            # Points at chunks that exist?
            missing_files = []
            for f in sorted(the_toc["files"].keys()):
                if not os.path.isfile(os.path.join(base_dir, f)):
                    missing_files.append(f)
            if missing_files:
                raise ValidationError(
                    _("Missing import-chunks named in table-of-contents: {}.").format(
                        missing_files))

            errs = []
            # validate the sha256 of the toc-entries
            # gather errors for reporting at the end
            chunks = sorted(the_toc["files"].keys())
            data = dict(message="Validating Chunks",
                        code="validate.chunks",
                        total=len(chunks))
            with ProgressReport(**data) as pb:
                for chunk in pb.iter(chunks):
                    a_hash = _compute_hash(os.path.join(base_dir, chunk))
                    if a_hash != the_toc["files"][chunk]:
                        err_str = "File {} expected checksum : {}, computed checksum : {}".format(
                            chunk, the_toc["files"][chunk], a_hash)
                        errs.append(err_str)

            # if there are any errors, report and fail
            if errs:
                raise ValidationError(
                    _("Import chunk hash mismatch: {}.").format(str(errs)))

        return the_toc

    def validate_and_assemble(toc_filename):
        """Validate checksums of, and reassemble, chunks in table-of-contents file."""
        the_toc = validate_toc(toc_filename)
        toc_dir = os.path.dirname(toc_filename)
        result_file = os.path.join(toc_dir, the_toc["meta"]["file"])

        # if we have only one entry in "files", it must be the full .tar.gz - return it
        if len(the_toc["files"]) == 1:
            return os.path.join(toc_dir, list(the_toc["files"].keys())[0])

        # We have multiple chunks.
        # reassemble into one file 'next to' the toc and return the resulting full-path
        chunk_size = int(the_toc["meta"]["chunk_size"])
        offset = 0
        block_size = 1024
        blocks_per_chunk = int(chunk_size / block_size)

        # sorting-by-filename is REALLY IMPORTANT here
        # keys are of the form <base-export-name>.00..<base-export-name>.NN,
        # and must be reassembled IN ORDER
        the_chunk_files = sorted(the_toc["files"].keys())

        data = dict(message="Recombining Chunks",
                    code="recombine.chunks",
                    total=len(the_chunk_files))
        with ProgressReport(**data) as pb:
            for chunk in pb.iter(the_chunk_files):
                # For each chunk, add it to the reconstituted tar.gz, picking up where the previous
                # chunk left off
                subprocess.run([
                    "dd",
                    "if={}".format(os.path.join(toc_dir, chunk)),
                    "of={}".format(result_file),
                    "bs={}".format(block_size),
                    "seek={}".format(offset),
                ])
                offset += blocks_per_chunk
                # To keep from taking up All The Disk, we delete each chunk after it has been added
                # to the recombined file.
                try:
                    os.remove(os.path.join(toc_dir, chunk))
                except OSError:
                    log.warning(
                        _("Failed to remove chunk {} after recombining. Continuing.").format(
                            os.path.join(toc_dir, chunk)
                        ),
                        exc_info=True,
                    )

        combined_hash = _compute_hash(result_file)
        if combined_hash != the_toc["meta"]["global_hash"]:
            raise ValidationError(
                _("Mismatch between combined .tar.gz checksum [{}] and originating [{}].").format(
                    combined_hash, the_toc["meta"]["global_hash"]))
        # if we get this far, then: the chunk-files all existed, they all pass checksum validation,
        # and there exists a combined .tar.gz, which *also* passes checksum-validation.
        # Let the rest of the import process do its thing on the new combined-file.
        return result_file

    if toc:
        log.info(_("Validating TOC {}.").format(toc))
        path = validate_and_assemble(toc)

    log.info(_("Importing {}.").format(path))
    current_task = Task.current()
    importer = PulpImporter.objects.get(pk=importer_pk)
    the_import = PulpImport.objects.create(importer=importer,
                                           task=current_task,
                                           params={"path": path})
    CreatedResource.objects.create(content_object=the_import)

    task_group = TaskGroup.objects.create(description=f"Import of {path}")
    Task.objects.filter(pk=current_task.pk).update(task_group=task_group)
    current_task.refresh_from_db()
    CreatedResource.objects.create(content_object=task_group)

    with tempfile.TemporaryDirectory() as temp_dir:
        with tarfile.open(path, "r:gz") as tar:
            tar.extractall(path=temp_dir)

        # Check version info
        with open(os.path.join(temp_dir, VERSIONS_FILE)) as version_file:
            version_json = json.load(version_file)
            _check_versions(version_json)

        # Artifacts
        ar_result = _import_file(os.path.join(temp_dir, ARTIFACT_FILE),
                                 ArtifactResource)
        data = dict(message="Importing Artifacts",
                    code="import.artifacts",
                    total=len(ar_result.rows))
        with ProgressReport(**data) as pb:
            for row in pb.iter(ar_result.rows):
                artifact = Artifact.objects.get(pk=row.object_id)
                base_path = os.path.join("artifact", artifact.sha256[0:2],
                                         artifact.sha256[2:])
                src = os.path.join(temp_dir, base_path)
                dest = os.path.join(settings.MEDIA_ROOT, base_path)

                if not default_storage.exists(dest):
                    with open(src, "rb") as f:
                        default_storage.save(dest, f)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)
            gpr = GroupProgressReport(
                message="Importing repository versions",
                code="import.repo.versions",
                total=len(data),
                done=0,
                task_group=task_group,
            )
            gpr.save()

            for src_repo in data:
                try:
                    dest_repo = _destination_repo(importer, src_repo["name"])
                except Repository.DoesNotExist:
                    log.warning(
                        _("Could not find destination repo for {}. Skipping.").format(
                            src_repo["name"]))
                    continue

                dispatch(
                    import_repository_version,
                    [dest_repo],
                    args=[importer.pk, dest_repo.pk, src_repo["name"], path],
                    task_group=task_group,
                )

    task_group.finish()
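
One consideration around the tar.extractall() calls above: on Python versions that support
tarfile extraction filters (3.12+, plus older releases where the feature was backported),
extraction can be hardened against path-traversal member names. A sketch, assuming such a
Python version is available:

import tarfile


def safe_extractall(tar_path, dest_dir):
    with tarfile.open(tar_path, "r:gz") as tar:
        # The "data" filter rejects absolute paths, links escaping dest_dir, and special files.
        tar.extractall(path=dest_dir, filter="data")
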
Example 7
def orphan_cleanup():
    """
    Delete all orphan Content and Artifact records.
    This task removes Artifact files from the filesystem as well.
    """
    content = Content.objects.filter(version_memberships__isnull=True).exclude(
        pulp_type=PublishedMetadata.get_pulp_type())

    content_count = content.count()
    progress_bar = ProgressReport(
        message="Clean up orphan Content",
        total=content_count,
        code="clean-up.content",
        done=0,
        state="running",
    )
    progress_bar.save()

    # delete the content
    for c in queryset_iterator(content):
        progress_bar.increase_by(c.count())
        c.delete()

    progress_bar.state = "completed"
    progress_bar.save()

    # delete the artifacts that don't belong to any content
    artifacts = Artifact.objects.filter(content_memberships__isnull=True)

    progress_bar = ProgressReport(
        message="Clean up orphan Artifacts",
        total=artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()
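
The comment about calling delete() manually reflects standard Django behavior: QuerySet.delete()
is a bulk SQL operation that does not run each instance's delete() override, so per-artifact file
cleanup would be skipped. A schematic contrast using the queryset above:

# Bulk delete: a single SQL DELETE; Artifact.delete() overrides never run,
# so the files those rows reference would be left behind in storage.
Artifact.objects.filter(content_memberships__isnull=True).delete()

# Per-instance delete (what the task above does): slower, but each
# Artifact.delete() runs and can remove its file from the filesystem.
for artifact in Artifact.objects.filter(content_memberships__isnull=True).iterator():
    artifact.delete()
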
Example 8
def import_repository_version(importer_pk, destination_repo_pk, source_repo_name, tar_path):
    """
    Import a repository version from a Pulp export.

    Args:
        importer_pk (str): Importer we are working with
        destination_repo_pk (str): Primary key of Repository to import into.
        source_repo_name (str): Name of the Repository in the export.
        tar_path (str): A path to export tar.
    """
    dest_repo = Repository.objects.get(pk=destination_repo_pk)
    importer = PulpImporter.objects.get(pk=importer_pk)

    pb = ProgressReport(
        message=f"Importing content for {dest_repo.name}",
        code="import.repo.version.content",
        state=TASK_STATES.RUNNING,
    )
    pb.save()

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the repo file for the repo info
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extract(REPO_FILE, path=temp_dir)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

        src_repo = next(repo for repo in data if repo["name"] == source_repo_name)
        rv_path = os.path.join(temp_dir, _repo_version_path(src_repo))

        if dest_repo.pulp_type != src_repo["pulp_type"]:
            raise ValidationError(
                _(
                    "Repository type mismatch: {src_repo} ({src_type}) vs {dest_repo} "
                    "({dest_type})."
                ).format(
                    src_repo=src_repo["name"],
                    src_type=src_repo["pulp_type"],
                    dest_repo=dest_repo.name,
                    dest_type=dest_repo.pulp_type,
                )
            )

        # Extract the repo version files
        with tarfile.open(tar_path, "r:gz") as tar:
            for mem in tar.getmembers():
                if re.match(fr"^{_repo_version_path(src_repo)}/.+", mem.name):
                    tar.extract(mem, path=temp_dir)

        # Content
        plugin_name = src_repo["pulp_type"].split(".")[0]
        cfg = get_plugin_config(plugin_name)

        resulting_content_ids = []
        for res_class in cfg.exportable_classes:
            filename = f"{res_class.__module__}.{res_class.__name__}.json"
            a_result = _import_file(os.path.join(rv_path, filename), res_class)
            resulting_content_ids.extend(
                row.object_id for row in a_result.rows if row.import_type in ("new", "update")
            )

        # Once all content exists, create the ContentArtifact links
        ca_path = os.path.join(rv_path, CA_FILE)
        _import_file(ca_path, ContentArtifactResource)

        # see if we have a content mapping
        mapping_path = f"{_repo_version_path(src_repo)}/{CONTENT_MAPPING_FILE}"
        mapping = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            if mapping_path in tar.getnames():
                tar.extract(mapping_path, path=temp_dir)
                with open(os.path.join(temp_dir, mapping_path), "r") as mapping_file:
                    mapping = json.load(mapping_file)

        if mapping:
            # use the content mapping to map content to repos
            for repo_name, content_ids in mapping.items():
                repo = _destination_repo(importer, repo_name)
                content = Content.objects.filter(upstream_id__in=content_ids)
                with repo.new_version() as new_version:
                    new_version.set_content(content)
        else:
            # just map all the content to our destination repo
            content = Content.objects.filter(pk__in=resulting_content_ids)
            with dest_repo.new_version() as new_version:
                new_version.set_content(content)

        content_count = content.count()
        pb.total = content_count
        pb.done = content_count
        pb.state = TASK_STATES.COMPLETED
        pb.save()

    gpr = TaskGroup.current().group_progress_reports.filter(code="import.repo.versions")
    gpr.update(done=F("done") + 1)
Example 9
def orphan_cleanup(content_pks=None):
    """
    Delete all orphan Content and Artifact records.
    Go through orphan Content multiple times to remove content from subrepos.
    This task removes Artifact files from the filesystem as well.

    Kwargs:
        content_pks (list): A list of content pks. If specified, only remove these orphans.

    """
    progress_bar = ProgressReport(
        message="Clean up orphan Content",
        total=0,
        code="clean-up.content",
        done=0,
        state="running",
    )

    while True:
        content = Content.objects.filter(
            version_memberships__isnull=True).exclude(
                pulp_type=PublishedMetadata.get_pulp_type())
        if content_pks:
            content = content.filter(pk__in=content_pks)

        content_count = content.count()
        if not content_count:
            break

        progress_bar.total += content_count
        progress_bar.save()

        # delete the content
        for c in queryset_iterator(content):
            progress_bar.increase_by(c.count())
            c.delete()

    progress_bar.state = "completed"
    progress_bar.save()

    # delete the artifacts that don't belong to any content
    artifacts = Artifact.objects.filter(content_memberships__isnull=True)

    progress_bar = ProgressReport(
        message="Clean up orphan Artifacts",
        total=artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()
Example 10
def orphan_cleanup():
    """
    Delete all orphan Content and Artifact records.
    This task removes Artifact files from the filesystem as well.
    """
    # Content cleanup
    content = Content.objects.exclude(pk__in=RepositoryContent.objects.values_list('content_id',
                                                                                   flat=True))
    content = content.exclude(pulp_type='core.{}'.format(PublishedMetadata.TYPE))
    progress_bar = ProgressReport(message='Clean up orphan Content', total=content.count(),
                                  code='clean-up.content', done=0, state='running')
    progress_bar.save()
    content.delete()
    progress_bar.done = progress_bar.total
    progress_bar.state = 'completed'
    progress_bar.save()

    # Artifact cleanup
    artifacts = Artifact.objects.exclude(pk__in=ContentArtifact.objects.values_list('artifact_id',
                                                                                    flat=True))
    progress_bar = ProgressReport(message='Clean up orphan Artifacts', total=artifacts.count(),
                                  code='clean-up.content', done=0, state='running')
    progress_bar.save()
    for artifact in artifacts:
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.increment()

    progress_bar.state = 'completed'
    progress_bar.save()
Example 11
def reclaim_space(repo_pks, keeplist_rv_pks=None, force=False):
    """
    This task frees-up disk space by removing Artifact files from the filesystem for Content
    exclusive to the list of provided repos.

    Note: content marked as `protected` will be excluded from disk-space reclamation.

    Kwargs:
        repo_pks (list): A list of repo pks that disk-space reclamation is performed on.
        keeplist_rv_pks (list): A list of repo version pks whose content will be excluded from
        disk-space reclamation.
        force (bool): If True, uploaded content will be taken into account.

    """
    reclaimed_repos = Repository.objects.filter(pk__in=repo_pks)
    for repo in reclaimed_repos:
        repo.invalidate_cache(everything=True)

    rest_of_repos = Repository.objects.exclude(pk__in=repo_pks)
    c_keep_qs = Content.objects.filter(repositories__in=rest_of_repos)
    c_reclaim_qs = Content.objects.filter(repositories__in=repo_pks)
    c_reclaim_qs = c_reclaim_qs.exclude(
        pk__in=c_keep_qs, pulp_type=PublishedMetadata.get_pulp_type())

    if keeplist_rv_pks:
        rv_qs = RepositoryVersion.objects.filter(pk__in=keeplist_rv_pks)
        rv_content = Content.objects.none()
        for rv in rv_qs.iterator():
            rv_content |= rv.content
        c_reclaim_qs = c_reclaim_qs.exclude(pk__in=rv_content)

    content_distinct = c_reclaim_qs.distinct("pulp_type")
    unprotected = []
    for content in content_distinct:
        if not content.cast().PROTECTED_FROM_RECLAIM:
            unprotected.append(content.pulp_type)

    ca_qs = ContentArtifact.objects.filter(
        content__in=c_reclaim_qs.values("pk"), artifact__isnull=False)
    if not force:
        ca_qs = ca_qs.filter(remoteartifact__isnull=False)
    artifact_pks = set()
    ca_to_update = []
    for ca in ca_qs.iterator():
        if ca.content.pulp_type in unprotected:
            artifact_pks.add(ca.artifact.pk)
            ca.artifact = None
            ca_to_update.append(ca)

    ContentArtifact.objects.bulk_update(objs=ca_to_update,
                                        fields=["artifact"],
                                        batch_size=1000)
    artifacts_to_delete = Artifact.objects.filter(pk__in=artifact_pks)
    progress_bar = ProgressReport(
        message="Reclaim disk space",
        total=artifacts_to_delete.count(),
        code="reclaim-space.artifact",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts_to_delete.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()
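
A hypothetical call, reclaiming space for two repositories while keeping the content of one
specific repository version; the primary keys below are placeholders:

reclaim_space(
    repo_pks=["11111111-1111-1111-1111-111111111111",
              "22222222-2222-2222-2222-222222222222"],
    keeplist_rv_pks=["33333333-3333-3333-3333-333333333333"],
    force=False,
)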