Example #1
def orphan_cleanup():
    """
    Delete all orphan Content and Artifact records.
    This task removes Artifact files from the filesystem as well.
    """
    # Content cleanup
    content = Content.objects.exclude(pk__in=RepositoryContent.objects.values_list('content_id',
                                                                                   flat=True))
    # PublishedMetadata is generated by publications and never added to a repository
    # version, so it would always look orphaned; exclude it from the cleanup.
    content = content.exclude(pulp_type='core.{}'.format(PublishedMetadata.TYPE))
    progress_bar = ProgressReport(message='Clean up orphan Content', total=content.count(),
                                  code='clean-up.content', done=0, state='running')
    progress_bar.save()
    content.delete()
    progress_bar.done = progress_bar.total
    progress_bar.state = 'completed'
    progress_bar.save()

    # Artifact cleanup
    artifacts = Artifact.objects.exclude(pk__in=ContentArtifact.objects.values_list('artifact_id',
                                                                                    flat=True))
    progress_bar = ProgressReport(message='Clean up orphan Artifacts', total=artifacts.count(),
                                  code='clean-up.content', done=0, state='running')
    progress_bar.save()
    for artifact in artifacts:
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.increment()

    progress_bar.state = 'completed'
    progress_bar.save()
Example #2
def orphan_cleanup():
    """
    Delete all orphan Content and Artifact records.
    Go through orphan Content multiple times to remove content from subrepos.
    This task removes Artifact files from the filesystem as well.

    """
    progress_bar = ProgressReport(
        message="Clean up orphan Content",
        total=0,
        code="clean-up.content",
        done=0,
        state="running",
    )

    while True:
        # Orphans are content with no repository-version membership; publication metadata is
        # excluded because it never belongs to a repository version. Deleting one pass of
        # orphans can orphan content that only subrepos referenced, hence the outer loop.
        content = Content.objects.filter(
            version_memberships__isnull=True).exclude(
                pulp_type=PublishedMetadata.get_pulp_type())
        content_count = content.count()
        if not content_count:
            break

        progress_bar.total += content_count
        progress_bar.save()

        # delete the content
        for c in queryset_iterator(content):
            progress_bar.increase_by(c.count())
            c.delete()

    progress_bar.state = "completed"
    progress_bar.save()

    # delete the artifacts that don't belong to any content
    artifacts = Artifact.objects.filter(content_memberships__isnull=True)

    progress_bar = ProgressReport(
        message="Clean up orphan Artifacts",
        total=artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()
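
Both orphan-cleanup variants above walk the orphan queryset through a queryset_iterator helper that yields it in chunks, so each chunk can be counted for the progress report and bulk-deleted in one statement. The helper itself is not part of these examples; a minimal sketch of such a pk-sliced chunker (the name queryset_chunks and the chunk size are assumptions for illustration, not pulpcore's actual implementation) could look like:

def queryset_chunks(queryset, chunk_size=2000):
    """Yield sub-querysets of at most chunk_size rows, sliced by primary key."""
    # Materialize the pks once, then hand back small querysets so the caller can still
    # use count() and delete() on each chunk.
    pks = list(queryset.values_list("pk", flat=True))
    for start in range(0, len(pks), chunk_size):
        yield queryset.model.objects.filter(pk__in=pks[start:start + chunk_size])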
Example #3
def import_repository_version(importer_pk, destination_repo_pk,
                              source_repo_name, tar_path):
    """
    Import a repository version from a Pulp export.

    Args:
        importer_pk (str): Importer we are working with
        destination_repo_pk (str): Primary key of Repository to import into.
        source_repo_name (str): Name of the Repository in the export.
        tar_path (str): A path to export tar.
    """
    dest_repo = Repository.objects.get(pk=destination_repo_pk)
    importer = PulpImporter.objects.get(pk=importer_pk)

    pb = ProgressReport(
        message=f"Importing content for {dest_repo.name}",
        code="import.repo.version.content",
        state=TASK_STATES.RUNNING,
    )
    pb.save()

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the repo file for the repo info
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extract(REPO_FILE, path=temp_dir)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

        src_repo = next(repo for repo in data
                        if repo["name"] == source_repo_name)

        if dest_repo.pulp_type != src_repo["pulp_type"]:
            raise ValidationError(
                _("Repository type mismatch: {src_repo} ({src_type}) vs {dest_repo} "
                  "({dest_type}).").format(
                      src_repo=src_repo["name"],
                      src_type=src_repo["pulp_type"],
                      dest_repo=dest_repo.name,
                      dest_type=dest_repo.pulp_type,
                  ))

        rv_name = ""
        # Extract the repo version files
        with tarfile.open(tar_path, "r:gz") as tar:
            for mem in tar.getmembers():
                match = re.search(
                    fr"(^repository-{source_repo_name}_[0-9]+)/.+", mem.name)
                if match:
                    rv_name = match.group(1)
                    tar.extract(mem, path=temp_dir)

        if not rv_name:
            raise ValidationError(
                _("No RepositoryVersion found for {}").format(rv_name))

        rv_path = os.path.join(temp_dir, rv_name)
        # Content
        plugin_name = src_repo["pulp_type"].split(".")[0]
        cfg = get_plugin_config(plugin_name)

        resulting_content_ids = []
        for res_class in cfg.exportable_classes:
            filename = f"{res_class.__module__}.{res_class.__name__}.json"
            a_result = _import_file(os.path.join(rv_path, filename),
                                    res_class,
                                    do_raise=False)
            # django import-export can have a problem with concurrent-imports that are
            # importing the same 'thing' (e.g., a Package that exists in two different
            # repo-versions that are being imported at the same time). We will try an import
            # that will simply record errors as they happen (rather than failing with an exception)
            # first. If errors happen, we'll do one retry before we give up on this repo-version's
            # import.
            if a_result.has_errors():
                log.info(
                    _("...{} import-errors encountered importing {} from {}, retrying"
                      ).format(a_result.totals["error"], filename, rv_name))
                # Second attempt, we allow to raise an exception on any problem.
                # This will either succeed, or log a fatal error and fail.
                try:
                    a_result = _import_file(os.path.join(rv_path, filename),
                                            res_class)
                except Exception:  # log on ANY exception and then re-raise
                    log.error(
                        _("FATAL import-failure importing {} from {}").format(
                            filename, rv_name))
                    raise

            # Collect the pks of content that was newly created or updated so it can be
            # added to the destination repository version below.
            resulting_content_ids.extend(row.object_id for row in a_result.rows
                                         if row.import_type in ("new",
                                                                "update"))

        # Once all content exists, create the ContentArtifact links
        ca_path = os.path.join(rv_path, CA_FILE)
        _import_file(ca_path, ContentArtifactResource)

        # see if we have a content mapping
        mapping_path = f"{rv_name}/{CONTENT_MAPPING_FILE}"
        mapping = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            if mapping_path in tar.getnames():
                tar.extract(mapping_path, path=temp_dir)
                with open(os.path.join(temp_dir, mapping_path),
                          "r") as mapping_file:
                    mapping = json.load(mapping_file)

        if mapping:
            # use the content mapping to map content to repos
            for repo_name, content_ids in mapping.items():
                repo = _destination_repo(importer, repo_name)
                content = Content.objects.filter(upstream_id__in=content_ids)
                with repo.new_version() as new_version:
                    new_version.set_content(content)
        else:
            # just map all the content to our destination repo
            content = Content.objects.filter(pk__in=resulting_content_ids)
            with dest_repo.new_version() as new_version:
                new_version.set_content(content)

        content_count = content.count()
        pb.total = content_count
        pb.done = content_count
        pb.state = TASK_STATES.COMPLETED
        pb.save()

    gpr = TaskGroup.current().group_progress_reports.filter(
        code="import.repo.versions")
    gpr.update(done=F("done") + 1)
Example #4
def import_repository_version(importer_pk, destination_repo_pk, source_repo_name, tar_path):
    """
    Import a repository version from a Pulp export.

    Args:
        importer_pk (str): Importer we are working with
        destination_repo_pk (str): Primary key of Repository to import into.
        source_repo_name (str): Name of the Repository in the export.
        tar_path (str): A path to export tar.
    """
    dest_repo = Repository.objects.get(pk=destination_repo_pk)
    importer = PulpImporter.objects.get(pk=importer_pk)

    pb = ProgressReport(
        message=f"Importing content for {dest_repo.name}",
        code="import.repo.version.content",
        state=TASK_STATES.RUNNING,
    )
    pb.save()

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract the repo file for the repo info
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extract(REPO_FILE, path=temp_dir)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

        src_repo = next(repo for repo in data if repo["name"] == source_repo_name)
        rv_path = os.path.join(temp_dir, _repo_version_path(src_repo))

        if dest_repo.pulp_type != src_repo["pulp_type"]:
            raise ValidationError(
                _(
                    "Repository type mismatch: {src_repo} ({src_type}) vs {dest_repo} "
                    "({dest_type})."
                ).format(
                    src_repo=src_repo["name"],
                    src_type=src_repo["pulp_type"],
                    dest_repo=dest_repo.name,
                    dest_type=dest_repo.pulp_type,
                )
            )

        # Extract the repo version files
        with tarfile.open(tar_path, "r:gz") as tar:
            for mem in tar.getmembers():
                if re.match(fr"^{_repo_version_path(src_repo)}/.+", mem.name):
                    tar.extract(mem, path=temp_dir)

        # Content
        plugin_name = src_repo["pulp_type"].split(".")[0]
        cfg = get_plugin_config(plugin_name)

        resulting_content_ids = []
        for res_class in cfg.exportable_classes:
            filename = f"{res_class.__module__}.{res_class.__name__}.json"
            a_result = _import_file(os.path.join(rv_path, filename), res_class)
            resulting_content_ids.extend(
                row.object_id for row in a_result.rows if row.import_type in ("new", "update")
            )

        # Once all content exists, create the ContentArtifact links
        ca_path = os.path.join(rv_path, CA_FILE)
        _import_file(ca_path, ContentArtifactResource)

        # see if we have a content mapping
        mapping_path = f"{_repo_version_path(src_repo)}/{CONTENT_MAPPING_FILE}"
        mapping = {}
        with tarfile.open(tar_path, "r:gz") as tar:
            if mapping_path in tar.getnames():
                tar.extract(mapping_path, path=temp_dir)
                with open(os.path.join(temp_dir, mapping_path), "r") as mapping_file:
                    mapping = json.load(mapping_file)

        if mapping:
            # use the content mapping to map content to repos
            for repo_name, content_ids in mapping.items():
                repo = _destination_repo(importer, repo_name)
                content = Content.objects.filter(upstream_id__in=content_ids)
                with repo.new_version() as new_version:
                    new_version.set_content(content)
        else:
            # just map all the content to our destination repo
            content = Content.objects.filter(pk__in=resulting_content_ids)
            with dest_repo.new_version() as new_version:
                new_version.set_content(content)

        content_count = content.count()
        pb.total = content_count
        pb.done = content_count
        pb.state = TASK_STATES.COMPLETED
        pb.save()

    gpr = TaskGroup.current().group_progress_reports.filter(code="import.repo.versions")
    gpr.update(done=F("done") + 1)
Example #5
def orphan_cleanup(content_pks=None,
                   orphan_protection_time=settings.ORPHAN_PROTECTION_TIME):
    """
    Delete all orphan Content and Artifact records.
    Go through orphan Content multiple times to remove content from subrepos.
    This task removes Artifact files from the filesystem as well.

    Kwargs:
        content_pks (list): A list of content pks. If specified, only remove these orphans.
        orphan_protection_time (int): Time in minutes; content and artifacts newer than this
            are not treated as orphans and are left in place.

    """
    progress_bar = ProgressReport(
        message="Clean up orphan Content",
        total=0,
        code="clean-up.content",
        done=0,
        state="running",
    )

    while True:
        content = Content.objects.orphaned(
            orphan_protection_time,
            content_pks).exclude(pulp_type=PublishedMetadata.get_pulp_type())
        content_count = content.count()
        if not content_count:
            break

        progress_bar.total += content_count
        progress_bar.save()

        # delete the content
        for c in queryset_iterator(content):
            progress_bar.increase_by(c.count())
            c.delete()

    progress_bar.state = "completed"
    progress_bar.save()

    # delete the artifacts that don't belong to any content
    artifacts = Artifact.objects.orphaned(orphan_protection_time)

    progress_bar = ProgressReport(
        message="Clean up orphan Artifacts",
        total=artifacts.count(),
        code="clean-up.content",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()
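
For illustration only, the Example #5 variant could be invoked directly to clean up a specific set of orphans right away, bypassing the protection window (content_to_remove is a hypothetical iterable of content objects; in a real deployment the function runs inside Pulp's tasking system rather than being called inline):

# Hypothetical direct call: remove only the listed orphans, with no protection delay.
orphan_cleanup(
    content_pks=[c.pk for c in content_to_remove],
    orphan_protection_time=0,
)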
Example #6
def reclaim_space(repo_pks, keeplist_rv_pks=None, force=False):
    """
    This task frees up disk space by removing Artifact files from the filesystem for Content
    exclusive to the list of provided repos.

    Note: content marked as `protected` will be excluded from the disk space reclaim.

    Kwargs:
        repo_pks (list): A list of repo pks on which the disk space reclaim is performed.
        keeplist_rv_pks (list): A list of repo version pks whose content will be excluded from
            the disk space reclaim.
        force (bool): If True, uploaded content (which has no remote source to re-download
            from) is also taken into account.

    """
    reclaimed_repos = Repository.objects.filter(pk__in=repo_pks)
    for repo in reclaimed_repos:
        repo.invalidate_cache(everything=True)

    rest_of_repos = Repository.objects.exclude(pk__in=repo_pks)
    c_keep_qs = Content.objects.filter(repositories__in=rest_of_repos)
    c_reclaim_qs = Content.objects.filter(repositories__in=repo_pks)
    c_reclaim_qs = c_reclaim_qs.exclude(
        pk__in=c_keep_qs, pulp_type=PublishedMetadata.get_pulp_type())

    if keeplist_rv_pks:
        rv_qs = RepositoryVersion.objects.filter(pk__in=keeplist_rv_pks)
        rv_content = Content.objects.none()
        for rv in rv_qs.iterator():
            rv_content |= rv.content
        c_reclaim_qs = c_reclaim_qs.exclude(pk__in=rv_content)

    # Work out which content types are protected from reclaim; only artifacts belonging to
    # unprotected types are detached and deleted below.
    content_distinct = c_reclaim_qs.distinct("pulp_type")
    unprotected = []
    for content in content_distinct:
        if not content.cast().PROTECTED_FROM_RECLAIM:
            unprotected.append(content.pulp_type)

    # Find ContentArtifacts whose files can be removed. Unless force is set, only artifacts
    # that can be re-downloaded from a RemoteArtifact are considered.
    ca_qs = ContentArtifact.objects.filter(
        content__in=c_reclaim_qs.values("pk"), artifact__isnull=False)
    if not force:
        ca_qs = ca_qs.filter(remoteartifact__isnull=False)
    artifact_pks = set()
    ca_to_update = []
    for ca in ca_qs.iterator():
        if ca.content.pulp_type in unprotected:
            artifact_pks.add(ca.artifact.pk)
            ca.artifact = None
            ca_to_update.append(ca)

    ContentArtifact.objects.bulk_update(objs=ca_to_update,
                                        fields=["artifact"],
                                        batch_size=1000)
    artifacts_to_delete = Artifact.objects.filter(pk__in=artifact_pks)
    progress_bar = ProgressReport(
        message="Reclaim disk space",
        total=artifacts_to_delete.count(),
        code="reclaim-space.artifact",
        done=0,
        state="running",
    )
    progress_bar.save()

    counter = 0
    interval = 100
    for artifact in artifacts_to_delete.iterator():
        # we need to manually call delete() because it cleans up the file on the filesystem
        artifact.delete()
        progress_bar.done += 1
        counter += 1

        if counter >= interval:
            progress_bar.save()
            counter = 0

    progress_bar.state = "completed"
    progress_bar.save()
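
Again for illustration only, a direct call to reclaim_space with hypothetical repository and repository-version objects (in practice this, too, is dispatched as a Pulp task):

# repo_a, repo_b and pinned_version are placeholders for real model instances.
reclaim_space(
    repo_pks=[repo_a.pk, repo_b.pk],        # reclaim disk space for these repositories
    keeplist_rv_pks=[pinned_version.pk],    # but keep artifacts still used by this version
    force=False,                            # only remove artifacts that can be re-downloaded
)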