def pulp_import(importer_pk, path):
    """
    Import a Pulp export into Pulp.

    Args:
        importer_pk (str): Primary key of PulpImporter to do the import
        path (str): Path to the export to be imported
    """

    def destination_repo(source_repo_name):
        """Find the destination repository based on source repo's name."""
        if importer.repo_mapping and importer.repo_mapping.get(source_repo_name):
            dest_repo_name = importer.repo_mapping[source_repo_name]
        else:
            dest_repo_name = source_repo_name
        return Repository.objects.get(name=dest_repo_name)

    log.info(_("Importing {}.").format(path))
    importer = PulpImporter.objects.get(pk=importer_pk)
    pulp_import = PulpImport.objects.create(
        importer=importer, task=Task.current(), params={"path": path}
    )
    CreatedResource.objects.create(content_object=pulp_import)

    task_group = TaskGroup.objects.create(description=f"Import of {path}")
    CreatedResource.objects.create(content_object=task_group)

    with tempfile.TemporaryDirectory() as temp_dir:
        with tarfile.open(path, "r:gz") as tar:
            tar.extractall(path=temp_dir)

        # Artifacts
        ar_result = _import_file(os.path.join(temp_dir, ARTIFACT_FILE), ArtifactResource)
        for row in ar_result.rows:
            artifact = Artifact.objects.get(pk=row.object_id)
            base_path = os.path.join("artifact", artifact.sha256[0:2], artifact.sha256[2:])
            src = os.path.join(temp_dir, base_path)
            dest = os.path.join(settings.MEDIA_ROOT, base_path)

            if not default_storage.exists(dest):
                with open(src, "rb") as f:
                    default_storage.save(dest, f)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

            for src_repo in data:
                try:
                    dest_repo = destination_repo(src_repo["name"])
                except Repository.DoesNotExist:
                    log.warning(
                        _("Could not find destination repo for {}. Skipping.").format(
                            src_repo["name"]
                        )
                    )
                    continue

                enqueue_with_reservation(
                    import_repository_version,
                    [dest_repo],
                    args=[dest_repo.pk, src_repo["pulp_id"], path],
                    task_group=task_group,
                )
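# A minimal, standalone sketch (an assumption, not part of the original source) of the
# name-mapping rule that destination_repo() applies above: an explicit repo_mapping entry wins,
# otherwise the source repository's own name is used unchanged. The repo names are hypothetical.
def _map_repo_name(repo_mapping, source_repo_name):
    """Return the destination repo name for source_repo_name, honoring an optional mapping."""
    if repo_mapping and repo_mapping.get(source_repo_name):
        return repo_mapping[source_repo_name]
    return source_repo_name

# Example: {"src-repo": "dest-repo"} maps "src-repo" to "dest-repo"; unmapped names map to themselves.
assert _map_repo_name({"src-repo": "dest-repo"}, "src-repo") == "dest-repo"
assert _map_repo_name({"src-repo": "dest-repo"}, "other") == "other"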
def fs_publication_export(exporter_pk, publication_pk):
    """
    Export a publication to the file system.

    Args:
        exporter_pk (str): FilesystemExporter pk
        publication_pk (str): Publication pk
    """
    exporter = Exporter.objects.get(pk=exporter_pk).cast()
    publication = Publication.objects.get(pk=publication_pk).cast()
    export = FilesystemExport.objects.create(
        exporter=exporter,
        params={"publication": publication_pk},
        task=Task.current(),
    )
    ExportedResource.objects.create(export=export, content_object=publication)
    CreatedResource.objects.create(content_object=export)

    log.info(
        _(
            "Exporting: file_system_exporter={exporter}, publication={publication}, path={path}"
        ).format(exporter=exporter.name, publication=publication.pk, path=exporter.path)
    )

    content_artifacts = ContentArtifact.objects.filter(
        pk__in=publication.published_artifact.values_list("content_artifact__pk", flat=True)
    )

    if publication.pass_through:
        content_artifacts |= ContentArtifact.objects.filter(
            content__in=publication.repository_version.content
        )

    _export_to_file_system(exporter.path, content_artifacts)
def fs_repo_version_export(exporter_pk, repo_version_pk):
    """
    Export a repository version to the file system.

    Args:
        exporter_pk (str): FilesystemExporter pk
        repo_version_pk (str): RepositoryVersion pk
    """
    exporter = Exporter.objects.get(pk=exporter_pk).cast()
    repo_version = RepositoryVersion.objects.get(pk=repo_version_pk)
    export = FilesystemExport.objects.create(
        exporter=exporter,
        params={"repository_version": repo_version_pk},
        task=Task.current(),
    )
    ExportedResource.objects.create(export=export, content_object=repo_version)
    CreatedResource.objects.create(content_object=export)

    log.info(
        _(
            "Exporting: file_system_exporter={exporter}, repo_version={repo_version}, path={path}"
        ).format(exporter=exporter.name, repo_version=repo_version.pk, path=exporter.path)
    )

    content_artifacts = ContentArtifact.objects.filter(content__in=repo_version.content)

    _export_to_file_system(exporter.path, content_artifacts)
def pulp_export(pulp_exporter):
    """
    Create a PulpExport to export pulp_exporter.repositories.

    1) Spit out all Artifacts, ArtifactResource.json, and RepositoryResource.json
    2) Spit out all *resource JSONs in per-repo-version directories
    3) Compute and store the sha256 and filename of the resulting tar.gz

    Args:
        pulp_exporter (models.PulpExporter): PulpExporter instance

    Raises:
        ValidationError: When path is not in the ALLOWED_EXPORT_PATHS setting,
            OR path exists and is not a directory
    """
    from pulpcore.app.serializers.exporter import ExporterSerializer

    ExporterSerializer.validate_path(pulp_exporter.path, check_is_dir=True)

    repositories = pulp_exporter.repositories.all()
    export = PulpExport.objects.create(exporter=pulp_exporter, task=Task.current(), params=None)
    tarfile_fp = export.export_tarfile_path()
    os.makedirs(pulp_exporter.path, exist_ok=True)

    with tarfile.open(tarfile_fp, "w:gz") as tar:
        export.tarfile = tar
        CreatedResource.objects.create(content_object=export)

        artifacts = []
        repo_versions = []
        # Gather up the versions and artifacts
        for repo in repositories:
            version = repo.latest_version()

            # Check version-content to make sure we're not being asked to export an on_demand repo
            content_artifacts = ContentArtifact.objects.filter(content__in=version.content)
            if content_artifacts.filter(artifact=None).exists():
                raise RuntimeError(_("Remote artifacts cannot be exported."))

            repo_versions.append(version)
            artifacts.extend(version.artifacts.all())

        from pulpcore.app.importexport import export_artifacts, export_content

        # Export the top-level entities (artifacts and repositories)
        export_artifacts(export, artifacts, pulp_exporter.last_export)

        # Export the repository-version data, per-version
        for version in repo_versions:
            export_content(export, version, pulp_exporter.last_export)
            ExportedResource.objects.create(export=export, content_object=version)

    sha256_hash = hashlib.sha256()
    with open(tarfile_fp, "rb") as f:
        # Read and update hash string value in blocks of 4K
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    export.sha256 = sha256_hash.hexdigest()

    export.filename = tarfile_fp
    export.save()
    pulp_exporter.last_export = export
    pulp_exporter.save()
def create_profile_db_and_connection():
    """
    Create a profile db from this task's UUID and a sqlite3 connection to that database.

    The database produced has three tables with the following SQL format:

    The `stages` table stores info about the pipeline itself and stores 3 fields
    * uuid - the uuid of the stage
    * name - the name of the stage
    * num - the number of the stage starting at 0

    The `traffic` table stores 3 fields:
    * uuid - the uuid of the stage this queue feeds into
    * waiting_time - the amount of time the item is waiting in the queue before it enters the stage.
    * service_time - the service time the item spent in the stage.

    The `system` table stores 3 fields:
    * uuid - the uuid of the stage this queue feeds into
    * length - the length of items in this queue, measured just before each arrival.
    * interarrival_time - the amount of time since the last arrival.
    """
    debug_data_dir = "/var/lib/pulp/debug/"
    pathlib.Path(debug_data_dir).mkdir(parents=True, exist_ok=True)
    current_task = Task.current()
    if current_task:
        db_path = debug_data_dir + str(current_task.pk)
    else:
        db_path = debug_data_dir + str(uuid.uuid4())

    import sqlite3

    global CONN
    CONN = sqlite3.connect(db_path)
    c = CONN.cursor()

    # Create the `stages` table
    c.execute("""CREATE TABLE stages (uuid varchar(36), name text, num int)""")

    # Create the `traffic` table
    c.execute("""CREATE TABLE traffic (uuid varchar(36), waiting_time real, service_time real)""")

    # Create the `system` table
    c.execute("""CREATE TABLE system (uuid varchar(36), length int, interarrival_time real)""")

    return CONN
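# A minimal usage sketch (an assumption, not part of the original source) showing how the profile
# database created above can be queried after a run. The db path passed in is hypothetical; the
# table and column names come from the docstring above.
import sqlite3

def summarize_stage_traffic(db_path):
    """Print average waiting and service time per pipeline stage, joining stages with traffic."""
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            """
            SELECT stages.num, stages.name, AVG(traffic.waiting_time), AVG(traffic.service_time)
            FROM stages JOIN traffic ON traffic.uuid = stages.uuid
            GROUP BY stages.uuid
            ORDER BY stages.num
            """
        ).fetchall()
    finally:
        conn.close()
    for num, name, avg_wait, avg_service in rows:
        print(f"stage {num} ({name}): avg wait={avg_wait:.4f}s avg service={avg_service:.4f}s")

# summarize_stage_traffic("/var/lib/pulp/debug/<task-uuid>")  # path layout as created above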
def dispatch(func, resources, args=None, kwargs=None, task_group=None):
    """
    Enqueue a message to Pulp workers with a reservation.

    This method provides normal enqueue functionality, while also requesting necessary locks for
    serialized urls. No two tasks that claim the same resource can execute concurrently. It
    accepts resources which it transforms into a list of urls (one for each resource).

    This method creates a :class:`pulpcore.app.models.Task` object and returns it.

    The values in `args` and `kwargs` must be JSON serializable, but may contain instances of
    ``uuid.UUID``.

    Args:
        func (callable): The function to be run by RQ when the necessary locks are acquired.
        resources (list): A list of resources this task needs exclusive access to while running.
            Each resource can be either a `str` or a `django.models.Model` instance.
        args (tuple): The positional arguments to pass on to the task.
        kwargs (dict): The keyword arguments to pass on to the task.
        task_group (pulpcore.app.models.TaskGroup): A TaskGroup to add the created Task to.

    Returns (pulpcore.app.models.Task): The Pulp Task that was created.

    Raises:
        ValueError: When `resources` is an unsupported type.
    """
    if settings.USE_NEW_WORKER_TYPE:
        args_as_json = json.dumps(args, cls=UUIDEncoder)
        kwargs_as_json = json.dumps(kwargs, cls=UUIDEncoder)
        resources = _validate_and_get_resources(resources)
        with transaction.atomic():
            task = Task.objects.create(
                state=TASK_STATES.WAITING,
                logging_cid=(get_guid() or ""),
                task_group=task_group,
                name=f"{func.__module__}.{func.__name__}",
                args=args_as_json,
                kwargs=kwargs_as_json,
                parent_task=Task.current(),
                reserved_resources_record=resources,
            )
            # Notify workers
            with db_connection.connection.cursor() as cursor:
                cursor.execute("NOTIFY pulp_worker_wakeup")
        return task
    else:
        RQ_job_id = _enqueue_with_reservation(
            func, resources=resources, args=args, kwargs=kwargs, task_group=task_group
        )
        return Task.objects.get(pk=RQ_job_id.id)
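# A minimal usage sketch (an assumption, not part of the original source) of dispatch() as
# defined above. `my_plugin.tasks.rebuild_metadata` and `repository` are hypothetical; the
# pattern of passing model instances as resources and primary keys in kwargs mirrors the
# callers shown elsewhere in this file.
#
# from my_plugin.tasks import rebuild_metadata
#
# task = dispatch(
#     rebuild_metadata,          # callable to run once the necessary locks are held
#     [repository],              # exclusive access to this Repository while running
#     kwargs={"repository_pk": str(repository.pk)},
# )
# # `task` is a pulpcore.app.models.Task in the WAITING state; workers are woken via
# # NOTIFY pulp_worker_wakeup (or via RQ on the legacy code path).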
def save(self, *args, **kwargs):
    """
    Auto-set the task_id if running inside a task.

    If the task_id is already set it will not be updated. If it is unset and this is running
    inside of a task it will be auto-set prior to saving.

    args (list): positional arguments to be passed on to the real save
    kwargs (dict): keyword arguments to be passed on to the real save
    """
    now = timezone.now()

    if not self.task_id:
        self.task = Task.current()

    if self._using_context_manager and self._last_save_time:
        if now - self._last_save_time >= datetime.timedelta(milliseconds=BATCH_INTERVAL):
            super().save(*args, **kwargs)
            self._last_save_time = now
    else:
        super().save(*args, **kwargs)
        self._last_save_time = now
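# A minimal usage sketch (an assumption, not part of the original source) of the batching
# behavior above: when the report is used as a context manager, repeated save() calls inside the
# block are throttled to at most one database write per BATCH_INTERVAL milliseconds, matching
# the ProgressReport usage seen in the import/export tasks elsewhere in this file. `items` and
# `handle` are hypothetical.
#
# data = dict(message="Doing work", code="do.work", total=len(items))
# with ProgressReport(**data) as pb:   # enters context-manager mode (_using_context_manager)
#     for item in pb.iter(items):      # each iteration updates progress and calls save()
#         handle(item)                 # only periodic saves actually hit the database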
def pulp_import(importer_pk, path, toc):
    """
    Import a Pulp export into Pulp.

    Args:
        importer_pk (str): Primary key of PulpImporter to do the import
        path (str): Path to the export to be imported
        toc (str): Path to the table-of-contents file describing a chunked export to be
            reassembled and imported
    """

    def _compute_hash(filename):
        sha256_hash = hashlib.sha256()
        with open(filename, "rb") as f:
            # Read and update hash string value in blocks of 4K
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def validate_toc(toc_filename):
        """
        Check validity of table-of-contents file.

        table-of-contents must:
          * exist
          * be valid JSON
          * point to chunked-export-files that exist 'next to' the 'toc' file
          * point to chunks whose checksums match the checksums stored in the 'toc' file

        Args:
            toc_filename (str): The user-provided toc-file-path to be validated.

        Raises:
            ValidationError: If toc is not a valid JSON table-of-contents file, or when toc
                points to chunked-export-files that can't be found in the same directory as the
                toc-file, or the checksums of the chunks do not match the checksums stored in toc.
        """
        with open(toc_filename) as json_file:
            # Valid JSON?
            the_toc = json.load(json_file)
            if not the_toc.get("files", None) or not the_toc.get("meta", None):
                raise ValidationError(_("Missing 'files' or 'meta' keys in table-of-contents!"))

            base_dir = os.path.dirname(toc_filename)
            # Points at chunks that exist?
            missing_files = []
            for f in sorted(the_toc["files"].keys()):
                if not os.path.isfile(os.path.join(base_dir, f)):
                    missing_files.append(f)
            if missing_files:
                raise ValidationError(
                    _("Missing import-chunks named in table-of-contents: {}.".format(
                        str(missing_files)))
                )

            errs = []
            # validate the sha256 of the toc-entries
            # gather errors for reporting at the end
            chunks = sorted(the_toc["files"].keys())
            data = dict(message="Validating Chunks", code="validate.chunks", total=len(chunks))
            with ProgressReport(**data) as pb:
                for chunk in pb.iter(chunks):
                    a_hash = _compute_hash(os.path.join(base_dir, chunk))
                    if not a_hash == the_toc["files"][chunk]:
                        err_str = "File {} expected checksum : {}, computed checksum : {}".format(
                            chunk, the_toc["files"][chunk], a_hash
                        )
                        errs.append(err_str)

            # if there are any errors, report and fail
            if errs:
                raise ValidationError(_("Import chunk hash mismatch: {}.").format(str(errs)))

        return the_toc

    def validate_and_assemble(toc_filename):
        """Validate checksums of, and reassemble, chunks in table-of-contents file."""
        the_toc = validate_toc(toc_filename)
        toc_dir = os.path.dirname(toc_filename)
        result_file = os.path.join(toc_dir, the_toc["meta"]["file"])

        # if we have only one entry in "files", it must be the full .tar.gz - return it
        if len(the_toc["files"]) == 1:
            return os.path.join(toc_dir, list(the_toc["files"].keys())[0])

        # We have multiple chunks.
        # reassemble into one file 'next to' the toc and return the resulting full-path
        chunk_size = int(the_toc["meta"]["chunk_size"])
        offset = 0
        block_size = 1024
        blocks_per_chunk = int(chunk_size / block_size)

        # sorting-by-filename is REALLY IMPORTANT here
        # keys are of the form <base-export-name>.00..<base-export-name>.NN,
        # and must be reassembled IN ORDER
        the_chunk_files = sorted(the_toc["files"].keys())

        data = dict(
            message="Recombining Chunks", code="recombine.chunks", total=len(the_chunk_files)
        )
        with ProgressReport(**data) as pb:
            for chunk in pb.iter(the_chunk_files):
                # For each chunk, add it to the reconstituted tar.gz, picking up where the
                # previous chunk left off
                subprocess.run(
                    [
                        "dd",
                        "if={}".format(os.path.join(toc_dir, chunk)),
                        "of={}".format(result_file),
                        "bs={}".format(str(block_size)),
                        "seek={}".format(str(offset)),
                    ],
                )
                offset += blocks_per_chunk

                # To keep from taking up All The Disk, we delete each chunk after it has been
                # added to the recombined file.
                try:
                    subprocess.run(["rm", "-f", os.path.join(toc_dir, chunk)])
                except OSError:
                    log.warning(
                        _("Failed to remove chunk {} after recombining. Continuing.").format(
                            os.path.join(toc_dir, chunk)
                        ),
                        exc_info=True,
                    )

        combined_hash = _compute_hash(result_file)
        if combined_hash != the_toc["meta"]["global_hash"]:
            raise ValidationError(
                _("Mismatch between combined .tar.gz checksum [{}] and originating [{}].").format(
                    combined_hash, the_toc["meta"]["global_hash"]
                )
            )
        # if we get this far, then: the chunk-files all existed, they all pass checksum
        # validation, and there exists a combined .tar.gz, which *also* passes
        # checksum-validation. Let the rest of the import process do its thing on the new
        # combined-file.
        return result_file

    if toc:
        log.info(_("Validating TOC {}.").format(toc))
        path = validate_and_assemble(toc)

    log.info(_("Importing {}.").format(path))
    current_task = Task.current()
    importer = PulpImporter.objects.get(pk=importer_pk)
    the_import = PulpImport.objects.create(
        importer=importer, task=current_task, params={"path": path}
    )
    CreatedResource.objects.create(content_object=the_import)

    task_group = TaskGroup.objects.create(description=f"Import of {path}")
    Task.objects.filter(pk=current_task.pk).update(task_group=task_group)
    current_task.refresh_from_db()
    CreatedResource.objects.create(content_object=task_group)

    with tempfile.TemporaryDirectory() as temp_dir:
        with tarfile.open(path, "r:gz") as tar:
            tar.extractall(path=temp_dir)

        # Check version info
        with open(os.path.join(temp_dir, VERSIONS_FILE)) as version_file:
            version_json = json.load(version_file)
            _check_versions(version_json)

        # Artifacts
        ar_result = _import_file(os.path.join(temp_dir, ARTIFACT_FILE), ArtifactResource)
        data = dict(
            message="Importing Artifacts", code="import.artifacts", total=len(ar_result.rows)
        )
        with ProgressReport(**data) as pb:
            for row in pb.iter(ar_result.rows):
                artifact = Artifact.objects.get(pk=row.object_id)
                base_path = os.path.join("artifact", artifact.sha256[0:2], artifact.sha256[2:])
                src = os.path.join(temp_dir, base_path)
                dest = os.path.join(settings.MEDIA_ROOT, base_path)

                if not default_storage.exists(dest):
                    with open(src, "rb") as f:
                        default_storage.save(dest, f)

        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)
            gpr = GroupProgressReport(
                message="Importing repository versions",
                code="import.repo.versions",
                total=len(data),
                done=0,
                task_group=task_group,
            )
            gpr.save()

            for src_repo in data:
                try:
                    dest_repo = _destination_repo(importer, src_repo["name"])
                except Repository.DoesNotExist:
                    log.warning(
                        _("Could not find destination repo for {}. Skipping.").format(
                            src_repo["name"]
                        )
                    )
                    continue

                dispatch(
                    import_repository_version,
                    [dest_repo],
                    args=[importer.pk, dest_repo.pk, src_repo["name"], path],
                    task_group=task_group,
                )

    task_group.finish()
def pulp_export(the_export):
    """
    Create a PulpExport to export pulp_exporter.repositories.

    1) Spit out all Artifacts, ArtifactResource.json, and RepositoryResource.json
    2) Spit out all *resource JSONs in per-repo-version directories
    3) Compute and store the sha256 and filename of the resulting tar.gz/chunks

    Args:
        the_export (models.PulpExport): PulpExport instance

    Raises:
        ValidationError: When path is not in the ALLOWED_EXPORT_PATHS setting,
            OR path exists and is not a directory
    """
    try:
        pulp_exporter = the_export.exporter
        the_export.task = Task.current()

        tarfile_fp = the_export.export_tarfile_path()

        path = Path(pulp_exporter.path)
        if not path.is_dir():
            path.mkdir(mode=0o775, parents=True)

        rslts = {}
        if the_export.validated_chunk_size:
            # write it into chunks
            with subprocess.Popen(
                [
                    "split",
                    "-a",
                    "4",
                    "-b",
                    str(the_export.validated_chunk_size),
                    "-d",
                    "-",
                    tarfile_fp + ".",
                ],
                stdin=subprocess.PIPE,
            ) as split_process:
                try:
                    with tarfile.open(tarfile_fp, "w|gz", fileobj=split_process.stdin) as tar:
                        _do_export(pulp_exporter, tar, the_export)
                except Exception:
                    # no matter what went wrong, we can't trust the files we (may have) created.
                    # Delete the ones we can find and pass the problem up.
                    for pathname in glob(tarfile_fp + ".*"):
                        os.remove(pathname)
                    raise

            # compute the hashes
            global_hash = hashlib.sha256()
            paths = sorted([str(Path(p)) for p in glob(tarfile_fp + ".*")])
            for a_file in paths:
                a_hash = _compute_hash(a_file, global_hash)
                rslts[a_file] = a_hash
            tarfile_hash = global_hash.hexdigest()

        else:
            # write into the file
            try:
                with tarfile.open(tarfile_fp, "w:gz") as tar:
                    _do_export(pulp_exporter, tar, the_export)
            except Exception:
                # no matter what went wrong, we can't trust the file we created.
                # Delete it if it exists and pass the problem up.
                if os.path.exists(tarfile_fp):
                    os.remove(tarfile_fp)
                raise

            # compute the hash
            tarfile_hash = _compute_hash(tarfile_fp)
            rslts[tarfile_fp] = tarfile_hash

        # store the outputfile/hash info
        the_export.output_file_info = rslts

        # write outputfile/hash info to a file 'next to' the output file(s)
        output_file_info_path = tarfile_fp.replace(".tar.gz", "-toc.json")
        with open(output_file_info_path, "w") as outfile:
            if the_export.validated_chunk_size:
                chunk_size = the_export.validated_chunk_size
            else:
                chunk_size = 0

            chunk_toc = {
                "meta": {
                    "chunk_size": chunk_size,
                    "file": os.path.basename(tarfile_fp),
                    "global_hash": tarfile_hash,
                },
                "files": {},
            }
            # Build a toc with just filenames (not the path on the exporter-machine)
            for a_path in rslts.keys():
                chunk_toc["files"][os.path.basename(a_path)] = rslts[a_path]
            json.dump(chunk_toc, outfile)

        # store toc info
        toc_hash = _compute_hash(output_file_info_path)
        the_export.output_file_info[output_file_info_path] = toc_hash
        the_export.toc_info = {"file": output_file_info_path, "sha256": toc_hash}
    finally:
        # whatever may have happened, make sure we save the export
        the_export.save()

    # If an exception was thrown, we'll never get here - which is good, because we don't want a
    # 'failed' export to be the last_export we derive the next incremental from

    # mark it as 'last'
    pulp_exporter.last_export = the_export
    # save the exporter
    pulp_exporter.save()
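# A possible shape (an assumption, not the verbatim helper) of the module-level _compute_hash()
# used above: it hashes a file in 4K blocks like the inline hashing loops elsewhere in this
# file, and optionally feeds the same bytes into a running "global" digest so one checksum can
# cover every chunk of a chunked export, matching how it is called with and without the second
# argument above.
import hashlib

def _compute_hash(filename, global_hash=None):
    """Return the sha256 hexdigest of filename, optionally updating a shared running digest."""
    sha256_hash = hashlib.sha256()
    with open(filename, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
            if global_hash is not None:
                global_hash.update(byte_block)
    return sha256_hash.hexdigest()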
def pulp_import(importer_pk, path):
    """
    Import a Pulp export into Pulp.

    Args:
        importer_pk (str): Primary key of PulpImporter to do the import
        path (str): Path to the export to be imported
    """

    def import_file(fpath, resource_class):
        log.info(_("Importing file {}.").format(fpath))
        with open(fpath, "r") as json_file:
            data = Dataset().load(json_file.read(), format="json")
            resource = resource_class()
            return resource.import_data(data, raise_errors=True)

    def destination_repo(source_repo_name):
        """Find the destination repository based on source repo's name."""
        if importer.repo_mapping and importer.repo_mapping.get(source_repo_name):
            dest_repo_name = importer.repo_mapping[source_repo_name]
        else:
            dest_repo_name = source_repo_name
        return Repository.objects.get(name=dest_repo_name)

    def repo_version_path(temp_dir, src_repo):
        """Find the repo version path in the export based on src_repo json."""
        src_repo_version = int(src_repo["next_version"]) - 1
        return os.path.join(temp_dir, f"repository-{src_repo['pulp_id']}_{src_repo_version}")

    log.info(_("Importing {}.").format(path))
    importer = PulpImporter.objects.get(pk=importer_pk)
    pulp_import = PulpImport.objects.create(
        importer=importer, task=Task.current(), params={"path": path}
    )
    CreatedResource.objects.create(content_object=pulp_import)

    with tempfile.TemporaryDirectory() as temp_dir:
        with tarfile.open(path, "r|gz") as tar:
            tar.extractall(path=temp_dir)

        # Artifacts
        ar_result = import_file(os.path.join(temp_dir, ARTIFACT_FILE), ArtifactResource)
        for row in ar_result.rows:
            artifact = Artifact.objects.get(pk=row.object_id)
            base_path = os.path.join("artifact", artifact.sha256[0:2], artifact.sha256[2:])
            src = os.path.join(temp_dir, base_path)
            dest = os.path.join(settings.MEDIA_ROOT, base_path)

            if not default_storage.exists(dest):
                with open(src, "rb") as f:
                    default_storage.save(dest, f)

        # Repo Versions
        with open(os.path.join(temp_dir, REPO_FILE), "r") as repo_data_file:
            data = json.load(repo_data_file)

            for src_repo in data:
                try:
                    dest_repo = destination_repo(src_repo["name"])
                except Repository.DoesNotExist:
                    log.warning(
                        _("Could not find destination repo for {}. Skipping.").format(
                            src_repo["name"]
                        )
                    )
                    continue

                rv_path = repo_version_path(temp_dir, src_repo)

                # Untyped Content
                content_path = os.path.join(rv_path, CONTENT_FILE)
                c_result = import_file(content_path, ContentResource)
                content = Content.objects.filter(pk__in=[r.object_id for r in c_result.rows])

                # Content Artifacts
                ca_path = os.path.join(rv_path, CA_FILE)
                import_file(ca_path, ContentArtifactResource)

                # Content
                plugin_name = src_repo["pulp_type"].split(".")[0]
                cfg = get_plugin_config(plugin_name)
                for res_class in cfg.exportable_classes:
                    filename = f"{res_class.__module__}.{res_class.__name__}.json"
                    import_file(os.path.join(rv_path, filename), res_class)

                # Create the repo version
                with dest_repo.new_version() as new_version:
                    new_version.set_content(content)

    return importer
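# An illustrative sketch (an assumption: the pulp_id/version values are made up and the
# file-name constants are shown as placeholders rather than their resolved values) of the
# extracted archive layout that the import above walks:
#
#   artifact/ab/cdef0123...            artifact payloads keyed by sha256 (first two hex chars
#                                      as a directory, remainder as the filename)
#   <ARTIFACT_FILE>                    ArtifactResource rows for every exported artifact
#   <REPO_FILE>                        one JSON record per exported repository
#   repository-<pulp_id>_<version>/    one directory per exported repository version
#       <CONTENT_FILE>                 untyped Content rows
#       <CA_FILE>                      ContentArtifact rows
#       <module>.<ResourceClass>.json  one file per plugin-declared exportable resource class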
def pulp_export(the_export):
    """
    Create a PulpExport to export pulp_exporter.repositories.

    1) Spit out all Artifacts, ArtifactResource.json, and RepositoryResource.json
    2) Spit out all *resource JSONs in per-repo-version directories
    3) Compute and store the sha256 and filename of the resulting tar.gz/chunks

    Args:
        the_export (models.PulpExport): PulpExport instance

    Raises:
        ValidationError: When path is not in the ALLOWED_EXPORT_PATHS setting,
            OR path exists and is not a directory
    """
    pulp_exporter = the_export.exporter
    the_export.task = Task.current()

    tarfile_fp = the_export.export_tarfile_path()
    os.makedirs(pulp_exporter.path, exist_ok=True)

    rslts = {}
    if the_export.validated_chunk_size:
        # write it into chunks
        with subprocess.Popen(
            [
                "split",
                "-a",
                "4",
                "-b",
                str(the_export.validated_chunk_size),
                "-d",
                "-",
                tarfile_fp + ".",
            ],
            stdin=subprocess.PIPE,
        ) as split_process:
            with tarfile.open(tarfile_fp, "w|gz", fileobj=split_process.stdin) as tar:
                _do_export(pulp_exporter, tar, the_export)

        # compute the hashes
        paths = [str(Path(p)) for p in glob(tarfile_fp + ".*")]
        for a_file in paths:
            a_hash = _compute_hash(a_file)
            rslts[a_file] = a_hash
    else:
        # write into the file
        with tarfile.open(tarfile_fp, "w:gz") as tar:
            _do_export(pulp_exporter, tar, the_export)

        # compute the hash
        tarfile_hash = _compute_hash(tarfile_fp)
        rslts[tarfile_fp] = tarfile_hash

    # store the outputfile/hash info
    the_export.output_file_info = rslts
    # save the export
    the_export.save()
    # mark it as 'last'
    pulp_exporter.last_export = the_export
    # save the exporter
    pulp_exporter.save()