def from_local_repomd(cls, repomd_path):
    """Create OriginRepo object from the local repomd.xml.

    @param repomd_path path to the repomd.xml"""
    repomd = cr.Repomd(repomd_path)
    repo = cls()
    repo._fill_from_repomd_object(repomd)
    return repo
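# A minimal usage sketch, assuming OriginRepo is the class defining the method above;
# the repomd.xml path below is hypothetical:
#
#     repo = OriginRepo.from_local_repomd("/srv/repo/repodata/repomd.xml")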
async def run(self):
    """Build `DeclarativeContent` from the repodata."""
    self.data.remote_url = self.new_url or self.remote.url

    progress_data = dict(message="Downloading Metadata Files", code="downloading.metadata")
    with ProgressReport(**progress_data) as metadata_pb:
        self.data.metadata_pb = metadata_pb

        downloader = self.remote.get_downloader(
            url=urljoin(self.data.remote_url, "repodata/repomd.xml"))
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        self.data.repomd = cr.Repomd(repomd_path)

        self.repository.last_sync_revision_number = self.data.repomd.revision
        self.repository.last_sync_repomd_checksum = get_sha256(repomd_path)

        await self.parse_distribution_tree()
        await self.parse_repository_metadata()
        await self.parse_modules_metadata()
        await self.parse_packages_components()
        await self.parse_content()

        # now send modules down the pipeline since all relations have been set up
        for modulemd in self.data.modulemd_list:
            await self.put(modulemd)

        for dc_group in self.data.dc_groups:
            await self.put(dc_group)
def calculate_contenthash(path):
    if not os.path.isdir(path) or \
       not os.path.isdir(os.path.join(path, "repodata/")):
        raise AttributeError("Not a repo: {0}".format(path))

    repomd_path = os.path.join(path, "repodata/repomd.xml")
    repomd = cr.Repomd(repomd_path)

    primary_path = None
    for rec in repomd.records:
        if rec.type == "primary":
            primary_path = rec.location_href
            break

    if not primary_path:
        raise CalculationException("primary metadata are missing")

    pkgids = []

    def pkgcb(pkg):
        pkgids.append("{0}{1}{2}".format(pkg.pkgId,
                                         pkg.location_href,
                                         pkg.location_base or ''))

    cr.xml_parse_primary(os.path.join(path, primary_path), pkgcb=pkgcb)

    contenthash = hashlib.new("sha256")
    for pkgid in sorted(pkgids):
        contenthash.update(pkgid.encode('utf-8'))
    return contenthash.hexdigest()
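# A minimal usage sketch of the helper above; the repository path is hypothetical and
# the os/hashlib/createrepo_c imports are assumed from the surrounding module:
#
#     print(calculate_contenthash("/srv/mirror/myrepo"))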
def insert_in_repo(comp_type, repodata, filetype, extension, source):
    """
    Inject a file into the repodata with the help of createrepo_c.

    Args:
        comp_type (int): createrepo_c compression type indication.
        repodata (basestring): The path to the repo where the metadata will be inserted.
        filetype (basestring): What type of metadata will be inserted by createrepo_c.
            This does allow any string to be inserted (custom types). There are some
            types which are used with dnf repos as primary, updateinfo, comps,
            filelists etc.
        extension (basestring): The file extension (xml, sqlite).
        source (basestring): A file path. File holds the dump of metadata until
            copied to the repodata folder.
    """
    log.info('Inserting %s.%s into %s', filetype, extension, repodata)
    target_fname = os.path.join(repodata, '%s.%s' % (filetype, extension))
    shutil.copyfile(source, target_fname)
    repomd_xml = os.path.join(repodata, 'repomd.xml')
    repomd = cr.Repomd(repomd_xml)
    # create a new record for our repomd.xml
    rec = cr.RepomdRecord(filetype, target_fname)
    # compress our metadata file with the comp_type
    rec_comp = rec.compress_and_fill(cr.SHA256, comp_type)
    # add hash to the compressed metadata file
    rec_comp.rename_file()
    # set type of metadata
    rec_comp.type = filetype
    # insert metadata about our metadata in repomd.xml
    repomd.set_record(rec_comp)
    with open(repomd_xml, 'w') as repomd_file:
        repomd_file.write(repomd.xml_dump())
    os.unlink(target_fname)
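# A minimal usage sketch of insert_in_repo(); the paths are hypothetical, and
# cr.GZ_COMPRESSION is one of the createrepo_c compression constants accepted as
# comp_type:
#
#     insert_in_repo(cr.GZ_COMPRESSION, '/srv/repo/repodata',
#                    'updateinfo', 'xml', '/tmp/updateinfo.xml')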
def test_repomd_with_path_in_constructor_repo01(self):
    repomd = cr.Repomd(REPO_01_REPOMD)
    self.assertEqual(repomd.revision, "1334667230")
    self.assertEqual(repomd.repo_tags, [])
    self.assertEqual(repomd.distro_tags, [])
    self.assertEqual(repomd.content_tags, [])
    self.assertEqual(len(repomd.records), 3)
def test_xml_parser_repomd_repo01_nowarningcb(self):
    repomd = cr.Repomd()
    cr.xml_parse_repomd(REPO_01_REPOMD, repomd)
    self.assertEqual(repomd.revision, "1334667230")
    self.assertEqual(repomd.repo_tags, [])
    self.assertEqual(repomd.distro_tags, [])
    self.assertEqual(repomd.content_tags, [])
    self.assertEqual(len(repomd.records), 3)
def save(self):
    self.primary.xml.close()
    self.filelists.xml.close()
    self.other.xml.close()

    repomd = createrepo_c.Repomd()
    repomd.set_record(self.primary.get_record())
    repomd.set_record(self.filelists.get_record())
    repomd.set_record(self.other.get_record())

    with (self.path / 'repomd.xml').open(mode='w') as f:
        f.write(repomd.xml_dump())
def test_repomd_indexing_and_iteration_repo01(self):
    repomd = cr.Repomd(REPO_01_REPOMD)

    types = []
    for rec in repomd:
        types.append(rec.type)
    self.assertEqual(types, ['filelists', 'other', 'primary'])

    rec = repomd["primary"]
    self.assertEqual(rec.type, "primary")
    self.assertRaises(KeyError, repomd.__getitem__, "foobar")

    self.assertTrue("primary" in repomd)
def modifyrepo(filename, repodata):
    repodata = os.path.join(repodata, 'repodata')
    uinfo_xml = os.path.join(repodata, os.path.basename(filename))
    shutil.copyfile(filename, uinfo_xml)
    uinfo_rec = cr.RepomdRecord('updateinfo', uinfo_xml)
    uinfo_rec.fill(cr.SHA256)
    uinfo_rec.rename_file()
    repomd_xml = os.path.join(repodata, 'repomd.xml')
    repomd = cr.Repomd(repomd_xml)
    repomd.set_record(uinfo_rec)
    with open(repomd_xml, 'w') as repomd_file:
        repomd_file.write(repomd.xml_dump())
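# A minimal usage sketch of modifyrepo(); both paths are hypothetical:
#
#     modifyrepo('/tmp/updateinfo.xml', '/srv/repo')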
def _fill_from_path(self, path, contenthash=True, contenthash_type="sha256"):
    """Fill attributes from a repository specified by path.

    :param path: Path to repository (a dir that contains
                 repodata/ subdirectory)
    :type path: str
    :param contenthash: Do content hash calculation (primary metadata
                        must be available in the repo)
    :type contenthash: bool
    :param contenthash_type: type of the calculated content hash
    :type contenthash_type: str
    """
    if not os.path.isdir(path) or \
       not os.path.isdir(os.path.join(path, "repodata/")) or \
       not os.path.isfile(os.path.join(path, "repodata/repomd.xml")):
        raise DeltaRepoError("Not a repository: {0}".format(path))

    repomd_path = os.path.join(path, "repodata/repomd.xml")
    repomd = cr.Repomd(repomd_path)
    self.repomd_contenthash = repomd.contenthash
    self.repomd_contenthash_type = repomd.contenthash_type

    self._fill_from_repomd_object(repomd)

    # Find a primary path
    primary_path = None
    for rec in repomd.records:
        md_path = os.path.join(path, rec.location_href)
        if os.path.isfile(md_path):
            self.present_metadata.append(rec.type)
        if rec.type == "primary":
            primary_path = md_path

    if contenthash:
        if not primary_path:
            raise DeltaRepoError("{0} - primary metadata are missing".format(path))
        self.contenthash = calculate_content_hash(primary_path, contenthash_type)
        self.contenthash_type = contenthash_type

    self.path = path
    self.repodata = os.path.join(path, "repodata")
    self.basename = os.path.basename(path)
    self.repomd_size = os.path.getsize(repomd_path)
def modifyrepo(self, filename):
    """Inject a file into the repodata for each architecture"""
    for arch in os.listdir(self.repo_path):
        repodata = os.path.join(self.repo_path, arch, 'repodata')
        log.info('Inserting %s into %s', filename, repodata)
        uinfo_xml = os.path.join(repodata, 'updateinfo.xml')
        shutil.copyfile(filename, uinfo_xml)
        repomd_xml = os.path.join(repodata, 'repomd.xml')
        repomd = cr.Repomd(repomd_xml)
        uinfo_rec = cr.RepomdRecord('updateinfo', uinfo_xml)
        uinfo_rec_comp = uinfo_rec.compress_and_fill(self.hash_type, self.comp_type)
        uinfo_rec_comp.rename_file()
        uinfo_rec_comp.type = 'updateinfo'
        repomd.set_record(uinfo_rec_comp)
        with open(repomd_xml, 'w') as repomd_file:
            repomd_file.write(repomd.xml_dump())
        os.unlink(uinfo_xml)
def hande_repomd(args, merger, repomd_filename):
    try:
        repomd = cr.Repomd(repomd_filename)
    except (RuntimeError, ValueError) as err:
        if not args.ignore_no_input:
            raise err
        logging.debug("{}: error loading repomd.xml: {}".format(
            repomd_filename, str(err)))
        return False

    # repomd was loaded and decoded successfully
    modules_path = False
    for record in repomd.records:
        if record.type == "modules":
            modules_path = record.location_href

    if not modules_path:
        logging.debug("{fn}: no modules section found in repomd.xml".format(
            fn=repomd_filename))
        if not args.ignore_no_input:
            raise ValueError('{fn} does not contain a modules section'.format(
                fn=repomd_filename))
        return False

    # strip repodata-prefix-dir from location_href
    filename = os.path.join(os.path.dirname(repomd_filename),
                            os.path.basename(modules_path))
    if os.path.isfile(filename):
        return merge_file(merger, filename)

    filename = os.path.join(os.path.dirname(repomd_filename), "../", modules_path)
    if os.path.isfile(filename):
        return merge_file(merger, filename)

    logging.debug(
        "{fn}: modules section found in repomd.xml, but href file {href} does not exist"
        .format(fn=repomd_filename, href=filename))
    if not args.ignore_no_input:
        raise ValueError(
            "{fn}: modules section found in repomd.xml, but href file {href} does "
            "not exist".format(fn=repomd_filename, href=filename))
    return False
def is_optimized_sync(repository, remote, url):
    """
    Check whether it is possible to optimize the synchronization or not.

    Caution: we are not storing when the remote was last updated, so this logic must
    stay in this order: check the repo version number first, because changes other
    than a sync could have taken place, leaving the date or repo version different
    from the last sync.

    Args:
        repository(RpmRepository): An RpmRepository to check optimization for.
        remote(RpmRemote): An RPMRemote to check optimization for.
        url(str): A remote repository URL.

    Returns:
        bool: True, if sync is optimized; False, otherwise.
    """
    with WorkingDirectory():
        result = get_repomd_file(remote, url)
        if not result:
            return False

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        repomd_checksum = get_sha256(repomd_path)

        is_optimized = (
            repository.last_sync_remote
            and remote.pk == repository.last_sync_remote.pk
            and repository.last_sync_repo_version == repository.latest_version().number
            and remote.pulp_last_updated <= repository.latest_version().pulp_created
            and is_previous_version(repomd.revision, repository.last_sync_revision_number)
            and repository.last_sync_repomd_checksum == repomd_checksum
        )

        if is_optimized:
            optimize_data = dict(message="Optimizing Sync", code="optimizing.sync")
            with ProgressReport(**optimize_data) as optimize_pb:
                optimize_pb.done = 1
                optimize_pb.save()

        return is_optimized
def get_repomd_record_xml_path(repo_path, record_type):
    """
    Returns a file path of the specified repomd record.

    Parameters
    ----------
    repo_path : str
        Repository path.
    record_type : str
        Type of the repomd record (e.g. "primary").

    Returns
    -------
    str or None
        Record file path or None if a record of that type is not found
        in the repository metadata.
    """
    repomd_path = os.path.join(repo_path, 'repodata/repomd.xml')
    repomd = createrepo_c.Repomd(repomd_path)
    for rec in repomd.records:
        if rec.type == record_type:
            return os.path.join(repo_path, rec.location_href)
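# A minimal usage sketch; the repository path is hypothetical:
#
#     primary_xml = get_repomd_record_xml_path('/srv/repo', 'primary')
#     if primary_xml is None:
#         raise RuntimeError('no primary record in repomd.xml')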
def third_method():
    """Parsing main metadata types (primary, filelists, other) at the same time.
    This approach significantly reduces memory footprint because we don't need
    to keep all the packages in memory; the user can handle them one by one.

    The API reflects xml_parse_primary/filelists/other except that it handles
    all of them at the same time.
    """
    def warningcb(warning_type, message):
        print("PARSER WARNING: %s" % message)
        return True

    repomd = cr.Repomd()
    cr.xml_parse_repomd(os.path.join(REPO_PATH, "repodata/repomd.xml"),
                        repomd, warningcb)

    primary_xml_path = None
    filelists_xml_path = None
    other_xml_path = None
    for record in repomd.records:
        if record.type == "primary":
            primary_xml_path = os.path.join(REPO_PATH, record.location_href)
        elif record.type == "filelists":
            filelists_xml_path = os.path.join(REPO_PATH, record.location_href)
        elif record.type == "other":
            other_xml_path = os.path.join(REPO_PATH, record.location_href)

    #
    # Main XML metadata parsing (primary, filelists, other)
    #

    def pkgcb(pkg):
        # Called when whole package entry from all 3 metadata xml files is parsed
        print_package_info(pkg)

    cr.xml_parse_main_metadata_together(primary_xml_path,
                                        filelists_xml_path,
                                        other_xml_path,
                                        None,
                                        pkgcb,
                                        warningcb,
                                        False)
def insert_in_repo(comp_type, repodata, filetype, extension, source, zchunk):
    """
    Inject a file into the repodata with the help of createrepo_c.

    Args:
        comp_type (int): createrepo_c compression type indication.
        repodata (str): The path to the repo where the metadata will be inserted.
        filetype (str): What type of metadata will be inserted by createrepo_c.
            This does allow any string to be inserted (custom types). There are some
            types which are used with dnf repos as primary, updateinfo, comps,
            filelists etc.
        extension (str): The file extension (xml, sqlite).
        source (str): A file path. File holds the dump of metadata until
            copied to the repodata folder.
        zchunk (bool): Whether zchunk data is supported for clients of this repo.
    """
    log.info('Inserting %s.%s into %s', filetype, extension, repodata)
    target_fname = os.path.join(repodata, '%s.%s' % (filetype, extension))
    shutil.copyfile(source, target_fname)
    repomd_xml = os.path.join(repodata, 'repomd.xml')
    repomd = cr.Repomd(repomd_xml)
    add_list = [(filetype, comp_type)]
    if zchunk and hasattr(cr, 'ZCK_COMPRESSION') and comp_type != cr.ZCK_COMPRESSION:
        add_list.append((filetype + "_zck", cr.ZCK_COMPRESSION))
    for (ft, ct) in add_list:
        # create a new record for our repomd.xml
        rec = cr.RepomdRecord(ft, target_fname)
        # compress our metadata file with the comp_type
        rec_comp = rec.compress_and_fill(cr.SHA256, ct)
        # add hash to the compressed metadata file
        rec_comp.rename_file()
        # set type of metadata
        rec_comp.type = ft
        # insert metadata about our metadata in repomd.xml
        repomd.set_record(rec_comp)
    with open(repomd_xml, 'w') as repomd_file:
        repomd_file.write(repomd.xml_dump())
    os.unlink(target_fname)
def streaming_iterator():
    """Parsing main metadata types (primary, filelists, other) at the same time.
    This approach significantly reduces memory footprint because we don't need
    to keep all the packages in memory; the user can handle them one by one.

    This is the most flexible method, and the recommended one if you need all of
    the RPM metadata. If you only need to parse one file it might not be the most
    efficient.
    """
    def warningcb(warning_type, message):
        print("PARSER WARNING: %s" % message)
        return True

    repomd = cr.Repomd()
    cr.xml_parse_repomd(os.path.join(REPO_PATH, "repodata/repomd.xml"),
                        repomd, warningcb)

    primary_xml_path = None
    filelists_xml_path = None
    other_xml_path = None
    for record in repomd.records:
        if record.type == "primary":
            primary_xml_path = os.path.join(REPO_PATH, record.location_href)
        elif record.type == "filelists":
            filelists_xml_path = os.path.join(REPO_PATH, record.location_href)
        elif record.type == "other":
            other_xml_path = os.path.join(REPO_PATH, record.location_href)

    #
    # Main XML metadata parsing (primary, filelists, other)
    #

    package_iterator = cr.PackageIterator(primary_path=primary_xml_path,
                                          filelists_path=filelists_xml_path,
                                          other_path=other_xml_path,
                                          warningcb=warningcb)

    for pkg in package_iterator:
        # Called when whole package entry from all 3 metadata xml files is parsed
        print_package_info(pkg)
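# Both examples above call print_package_info(), which is not shown in this section.
# A minimal sketch of such a helper, using only standard createrepo_c package
# attributes, could look like:
#
#     def print_package_info(pkg):
#         print("%s (%s): %s" % (pkg.name, pkg.arch, pkg.location_href))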
def parse_repodata(path):
    """ Return a list of packages included in this repository """
    try:
        repomd = cr.Repomd(os.path.join(path, "repodata/repomd.xml"))
    except OSError as e:
        logging.error(e)
        exit(2)

    for record in repomd.records:
        if record.type == "primary":
            primary_xml_path = record.location_href

    def warningcb(warning_type, message):
        """Optional callback for warnings about weird stuff and formatting in XML.

        :param warning_type: Integer value. One from the XML_WARNING_* constants.
        :param message: String message.
        """
        logging.warning("PARSER WARNING: %s" % message)
        return True

    packages = []

    def pkgcb(pkg):
        # Called when whole package entry in xml is parsed
        packages.append(pkg)

    cr.xml_parse_primary(os.path.join(path, primary_xml_path),
                         pkgcb=pkgcb,
                         do_files=False,
                         warningcb=warningcb)

    return packages
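# A minimal usage sketch of parse_repodata(); the path is hypothetical:
#
#     for pkg in parse_repodata("/srv/mirror/myrepo"):
#         print(pkg.name, pkg.location_href)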
def parse_repomd(path):
    repomd = cr.Repomd(path)
    print "Revision:", repomd.revision
    if repomd.contenthash:
        print "Contenthash:", repomd.contenthash
        print "Contenthash type:", repomd.contenthash_type
    print "Repo tags:", repomd.repo_tags
    print "Content tags:", repomd.content_tags
    print "Distro tags:", repomd.distro_tags
    print
    for rec in repomd.records:
        print "Type:", rec.type
        print "Location href:", rec.location_href
        print "Location base:", rec.location_base
        print "Checksum:", rec.checksum
        print "Checksum type:", rec.checksum_type
        print "Checksum open:", rec.checksum_open
        print "Checksum open type:", rec.checksum_open_type
        print "Timestamp:", rec.timestamp
        print "Size:", rec.size
        print "Size open:", rec.size_open
        if rec.db_ver:
            print "Db version:", rec.db_ver
        print
def parse_repomd(path):
    repomd = cr.Repomd(path)
    print("Revision:", repomd.revision)
    if repomd.contenthash:
        print("Contenthash:", repomd.contenthash)
        print("Contenthash type:", repomd.contenthash_type)
    print("Repo tags:", repomd.repo_tags)
    print("Content tags:", repomd.content_tags)
    print("Distro tags:", repomd.distro_tags)
    print()
    for rec in repomd.records:
        print("Type:", rec.type)
        print("Location href:", rec.location_href)
        print("Location base:", rec.location_base)
        print("Checksum:", rec.checksum)
        print("Checksum type:", rec.checksum_type)
        print("Checksum open:", rec.checksum_open)
        print("Checksum open type:", rec.checksum_open_type)
        print("Timestamp:", rec.timestamp)
        print("Size:", rec.size)
        print("Size open:", rec.size_open)
        if rec.db_ver:
            print("Db version:", rec.db_ver)
        print()
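# A minimal usage sketch; the path is hypothetical. Note that cr.Repomd(path) parses
# the file directly, while the test snippets elsewhere in this section instead fill an
# empty cr.Repomd() via cr.xml_parse_repomd() to attach a warning callback:
#
#     parse_repomd("/srv/repo/repodata/repomd.xml")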
def deltareposrecord_from_repopath(path, prefix_to_strip=None, logger=None):
    """Create DeltaRepoRecord object from a delta repository

    :param path: Path to a directory where a deltarepo lives
    :type path: str
    :param prefix_to_strip: Path prefix to strip from a path in the record
    :type prefix_to_strip: str or None
    :param logger: A logger
    :type logger: logging.Logger or None
    """
    # Prepare paths
    path = os.path.abspath(path)
    stripped_path = path
    if prefix_to_strip:
        abs_prefix_to_strip = os.path.abspath(prefix_to_strip)
        if path.startswith(abs_prefix_to_strip):
            stripped_path = os.path.relpath(path, abs_prefix_to_strip)

    # Parse repomd.xml of the delta repo
    repomd_path = os.path.join(path, "repodata/repomd.xml")
    repomd = cr.Repomd(repomd_path)

    deltametadata_path = None
    for repomd_rec in repomd.records:
        if repomd_rec.type == "deltametadata" and repomd_rec.location_href:
            deltametadata_path = os.path.join(path, repomd_rec.location_href)

    if not deltametadata_path:
        raise DeltaRepoError("Not a delta repository: {0}".format(path))

    # Parse deltametadata.xml of the delta repo
    dm = deltarepo.DeltaMetadata()
    dm.load(deltametadata_path)

    # Prepare DeltaRepoRecord aka <deltarepo>
    rec = deltarepo.DeltaRepoRecord()
    rec.location_base = None
    rec.location_href = stripped_path
    rec.revision_src = dm.revision_src
    rec.revision_dst = dm.revision_dst
    rec.contenthash_src = dm.contenthash_src
    rec.contenthash_dst = dm.contenthash_dst
    rec.contenthash_type = dm.contenthash_type
    rec.timestamp_src = dm.timestamp_src
    rec.timestamp_dst = dm.timestamp_dst

    # Prepare <data> elements with info about files in the repo
    for repomd_rec in repomd.records:
        if not repomd_rec.type:
            continue
        if isnonnegativeint(repomd_rec.size):
            rec.set_data(repomd_rec.type, repomd_rec.size)
        elif isnonnegativeint(repomd_rec.open_size):
            rec.set_data(repomd_rec.type, repomd_rec.open_size)

    # Collect info about repomd.xml file of the delta repo
    rec.repomd_timestamp = int(os.path.getmtime(repomd_path))
    rec.repomd_size = os.path.getsize(repomd_path)
    checksumval = compute_file_checksum(repomd_path)
    rec.repomd_checksums = [("sha256", checksumval)]

    return rec
def publish(repository_version_pk): """ Create a Publication based on a RepositoryVersion. Args: repository_version_pk (str): Create a publication from this repository version. """ repository_version = RepositoryVersion.objects.get( pk=repository_version_pk) log.info( _('Publishing: repository={repo}, version={version}').format( repo=repository_version.repository.name, version=repository_version.number, )) with WorkingDirectory(): with RpmPublication.create(repository_version) as publication: packages = populate(publication) # Prepare metadata files repomd_path = os.path.join(os.getcwd(), "repomd.xml") pri_xml_path = os.path.join(os.getcwd(), "primary.xml.gz") fil_xml_path = os.path.join(os.getcwd(), "filelists.xml.gz") oth_xml_path = os.path.join(os.getcwd(), "other.xml.gz") pri_db_path = os.path.join(os.getcwd(), "primary.sqlite") fil_db_path = os.path.join(os.getcwd(), "filelists.sqlite") oth_db_path = os.path.join(os.getcwd(), "other.sqlite") upd_xml_path = os.path.join(os.getcwd(), "updateinfo.xml.gz") pri_xml = cr.PrimaryXmlFile(pri_xml_path) fil_xml = cr.FilelistsXmlFile(fil_xml_path) oth_xml = cr.OtherXmlFile(oth_xml_path) pri_db = cr.PrimarySqlite(pri_db_path) fil_db = cr.FilelistsSqlite(fil_db_path) oth_db = cr.OtherSqlite(oth_db_path) upd_xml = cr.UpdateInfoXmlFile(upd_xml_path) pri_xml.set_num_of_pkgs(len(packages)) fil_xml.set_num_of_pkgs(len(packages)) oth_xml.set_num_of_pkgs(len(packages)) # Process all packages for package in packages: pkg = package.to_createrepo_c() pkg.location_href = package.contentartifact_set.first( ).relative_path pri_xml.add_pkg(pkg) fil_xml.add_pkg(pkg) oth_xml.add_pkg(pkg) pri_db.add_pkg(pkg) fil_db.add_pkg(pkg) oth_db.add_pkg(pkg) # Process update records for update_record in UpdateRecord.objects.filter( pk__in=publication.repository_version.content): upd_xml.add_chunk(update_record_xml(update_record)) pri_xml.close() fil_xml.close() oth_xml.close() upd_xml.close() repomd = cr.Repomd() repomdrecords = (("primary", pri_xml_path, pri_db), ("filelists", fil_xml_path, fil_db), ("other", oth_xml_path, oth_db), ("primary_db", pri_db_path, None), ("filelists_db", fil_db_path, None), ("other_db", oth_db_path, None), ("updateinfo", upd_xml_path, None)) sqlite_files = ("primary_db", "filelists_db", "other_db") for name, path, db_to_update in repomdrecords: record = cr.RepomdRecord(name, path) if name in sqlite_files: record_bz = record.compress_and_fill(cr.SHA256, cr.BZ2) record_bz.type = name record_bz.rename_file() path = record_bz.location_href.split('/')[-1] repomd.set_record(record_bz) else: record.fill(cr.SHA256) if (db_to_update): db_to_update.dbinfo_update(record.checksum) db_to_update.close() record.rename_file() path = record.location_href.split('/')[-1] repomd.set_record(record) metadata = PublishedMetadata( relative_path=os.path.join(REPODATA_PATH, os.path.basename(path)), publication=publication, file=File(open(os.path.basename(path), 'rb'))) metadata.save() with open(repomd_path, "w") as repomd_f: repomd_f.write(repomd.xml_dump()) metadata = PublishedMetadata( relative_path=os.path.join(REPODATA_PATH, os.path.basename(repomd_path)), publication=publication, file=File(open(os.path.basename(repomd_path), 'rb'))) metadata.save()
def test_xml_parser_repomd_repo01(self): warnings = [] def warningcb(warn_type, msg): warnings.append((warn_type, msg)) repomd = cr.Repomd() cr.xml_parse_repomd(REPO_01_REPOMD, repomd, warningcb) self.assertEqual(warnings, []) self.assertEqual(repomd.revision, "1334667230") self.assertEqual(repomd.repo_tags, []) self.assertEqual(repomd.distro_tags, []) self.assertEqual(repomd.content_tags, []) self.assertEqual(len(repomd.records), 3) self.assertEqual(repomd.records[0].type, "filelists") self.assertEqual(repomd.records[0].location_real, None) self.assertEqual( repomd.records[0].location_href, "repodata/c7db035d0e6f1b2e883a7fa3229e2d2be70c05a8b8d2b57dbb5f9c1a67483b6c-filelists.xml.gz" ) self.assertEqual( repomd.records[0].checksum, "c7db035d0e6f1b2e883a7fa3229e2d2be70c05a8b8d2b57dbb5f9c1a67483b6c") self.assertEqual(repomd.records[0].checksum_type, "sha256") self.assertEqual( repomd.records[0].checksum_open, "85bc611be5d81ac8da2fe01e98ef741d243d1518fcc46ada70660020803fbf09") self.assertEqual(repomd.records[0].checksum_open_type, "sha256") self.assertEqual(repomd.records[0].timestamp, 1334667230) self.assertEqual(repomd.records[0].size, 273) self.assertEqual(repomd.records[0].size_open, 389) self.assertEqual(repomd.records[0].db_ver, 0) self.assertEqual(repomd.records[1].type, "other") self.assertEqual(repomd.records[1].location_real, None) self.assertEqual( repomd.records[1].location_href, "repodata/b752a73d9efd4006d740f943db5fb7c2dd77a8324bd99da92e86bd55a2c126ef-other.xml.gz" ) self.assertEqual( repomd.records[1].checksum, "b752a73d9efd4006d740f943db5fb7c2dd77a8324bd99da92e86bd55a2c126ef") self.assertEqual(repomd.records[1].checksum_type, "sha256") self.assertEqual( repomd.records[1].checksum_open, "da6096c924349af0c326224a33be0cdb26897fbe3d25477ac217261652449445") self.assertEqual(repomd.records[1].checksum_open_type, "sha256") self.assertEqual(repomd.records[1].timestamp, 1334667230) self.assertEqual(repomd.records[1].size, 332) self.assertEqual(repomd.records[1].size_open, 530) self.assertEqual(repomd.records[1].db_ver, 0) self.assertEqual(repomd.records[2].type, "primary") self.assertEqual(repomd.records[2].location_real, None) self.assertEqual( repomd.records[2].location_href, "repodata/6c662d665c24de9a0f62c17d8fa50622307739d7376f0d19097ca96c6d7f5e3e-primary.xml.gz" ) self.assertEqual( repomd.records[2].checksum, "6c662d665c24de9a0f62c17d8fa50622307739d7376f0d19097ca96c6d7f5e3e") self.assertEqual(repomd.records[2].checksum_type, "sha256") self.assertEqual( repomd.records[2].checksum_open, "0fc6cadf97d515e87491d24dc9712d8ddaf2226a21ae7f131ff42d71a877c496") self.assertEqual(repomd.records[2].checksum_open_type, "sha256") self.assertEqual(repomd.records[2].timestamp, 1334667230) self.assertEqual(repomd.records[2].size, 782) self.assertEqual(repomd.records[2].size_open, 2085) self.assertEqual(repomd.records[2].db_ver, 0)
async def run(self): """ Build `DeclarativeContent` from the repodata. """ remote_url = self.new_url or self.remote.url remote_url = remote_url if remote_url[-1] == "/" else f"{remote_url}/" optimize_sync = self.optimize progress_data = dict(message='Downloading Metadata Files', code='downloading.metadata') with ProgressReport(**progress_data) as metadata_pb: downloader = self.remote.get_downloader( url=urljoin(remote_url, 'repodata/repomd.xml')) # TODO: decide how to distinguish between a mirror list and a normal repo result = await downloader.run() metadata_pb.increment() repomd_path = result.path repomd = cr.Repomd(repomd_path) # Caution: we are not storing when the remote was last updated, so the order of this # logic must remain in this order where we first check the version number as other # changes than sync could have taken place such that the date or repo version will be # different from last sync if (optimize_sync and self.repository.last_sync_remote and self.remote.pk == self.repository.last_sync_remote.pk and (self.repository.last_sync_repo_version == self.repository.latest_version().number) and (self.remote.pulp_last_updated <= self.repository.latest_version().pulp_created) and is_previous_version( repomd.revision, self.repository.last_sync_revision_number)): optimize_data = dict(message='Optimizing Sync', code='optimizing.sync') with ProgressReport(**optimize_data) as optimize_pb: optimize_pb.done = 1 optimize_pb.save() return self.repository.last_sync_revision_number = repomd.revision if self.treeinfo: d_artifacts = [ DeclarativeArtifact( artifact=Artifact(), url=urljoin(remote_url, self.treeinfo["filename"]), relative_path=".treeinfo", remote=self.remote, deferred_download=False, ) ] for path, checksum in self.treeinfo["download"][ "images"].items(): artifact = Artifact(**checksum) da = DeclarativeArtifact( artifact=artifact, url=urljoin(remote_url, path), relative_path=path, remote=self.remote, deferred_download=self.deferred_download) d_artifacts.append(da) distribution_tree = DistributionTree( **self.treeinfo["distribution_tree"]) dc = DeclarativeContent(content=distribution_tree, d_artifacts=d_artifacts) dc.extra_data = self.treeinfo await self.put(dc) package_repodata_urls = {} downloaders = [] modulemd_list = list() dc_groups = [] dc_categories = [] dc_environments = [] nevra_to_module = defaultdict(dict) pkgname_to_groups = defaultdict(list) group_to_categories = defaultdict(list) group_to_environments = defaultdict(list) optionalgroup_to_environments = defaultdict(list) modulemd_results = None comps_downloader = None main_types = set() checksums = {} for record in repomd.records: checksums[record.type] = record.checksum_type.upper() if record.type in PACKAGE_REPODATA: main_types.update([record.type]) package_repodata_urls[record.type] = urljoin( remote_url, record.location_href) elif record.type in UPDATE_REPODATA: updateinfo_url = urljoin(remote_url, record.location_href) downloader = self.remote.get_downloader(url=updateinfo_url) downloaders.append([downloader.run()]) elif record.type in COMPS_REPODATA: comps_url = urljoin(remote_url, record.location_href) comps_downloader = self.remote.get_downloader( url=comps_url) elif record.type in SKIP_REPODATA: continue elif '_zck' in record.type: continue elif record.type in MODULAR_REPODATA: modules_url = urljoin(remote_url, record.location_href) modulemd_downloader = self.remote.get_downloader( url=modules_url) modulemd_results = await modulemd_downloader.run() elif record.type not in PACKAGE_DB_REPODATA: file_data = { 
record.checksum_type: record.checksum, "size": record.size } da = DeclarativeArtifact( artifact=Artifact(**file_data), url=urljoin(remote_url, record.location_href), relative_path=record.location_href, remote=self.remote, deferred_download=False) repo_metadata_file = RepoMetadataFile( data_type=record.type, checksum_type=record.checksum_type, checksum=record.checksum, ) dc = DeclarativeContent(content=repo_metadata_file, d_artifacts=[da]) await self.put(dc) missing_type = set(PACKAGE_REPODATA) - main_types if missing_type: raise FileNotFoundError( _("XML file(s): {filename} not found").format( filename=", ".join(missing_type))) self.repository.original_checksum_types = checksums # we have to sync module.yaml first if it exists, to make relations to packages if modulemd_results: modulemd_index = mmdlib.ModuleIndex.new() open_func = gzip.open if modulemd_results.url.endswith( '.gz') else open with open_func(modulemd_results.path, 'r') as moduleyaml: content = moduleyaml.read() module_content = content if isinstance( content, str) else content.decode() modulemd_index.update_from_string(module_content, True) modulemd_names = modulemd_index.get_module_names() or [] modulemd_all = parse_modulemd(modulemd_names, modulemd_index) # Parsing modules happens all at one time, and from here on no useful work happens. # So just report that it finished this stage. modulemd_pb_data = { 'message': 'Parsed Modulemd', 'code': 'parsing.modulemds' } with ProgressReport(**modulemd_pb_data) as modulemd_pb: modulemd_total = len(modulemd_all) modulemd_pb.total = modulemd_total modulemd_pb.done = modulemd_total for modulemd in modulemd_all: artifact = modulemd.pop('artifact') relative_path = '{}{}{}{}{}snippet'.format( modulemd[PULP_MODULE_ATTR.NAME], modulemd[PULP_MODULE_ATTR.STREAM], modulemd[PULP_MODULE_ATTR.VERSION], modulemd[PULP_MODULE_ATTR.CONTEXT], modulemd[PULP_MODULE_ATTR.ARCH]) da = DeclarativeArtifact(artifact=artifact, relative_path=relative_path, url=modules_url) modulemd_content = Modulemd(**modulemd) dc = DeclarativeContent(content=modulemd_content, d_artifacts=[da]) dc.extra_data = defaultdict(list) # dc.content.artifacts are Modulemd artifacts for artifact in dc.content.artifacts: nevra_to_module.setdefault(artifact, set()).add(dc) modulemd_list.append(dc) # delete list now that we're done with it for memory savings del modulemd_all modulemd_default_names = parse_defaults(modulemd_index) # Parsing module-defaults happens all at one time, and from here on no useful # work happens. So just report that it finished this stage. 
modulemd_defaults_pb_data = { 'message': 'Parsed Modulemd-defaults', 'code': 'parsing.modulemd_defaults' } with ProgressReport( **modulemd_defaults_pb_data) as modulemd_defaults_pb: modulemd_defaults_total = len(modulemd_default_names) modulemd_defaults_pb.total = modulemd_defaults_total modulemd_defaults_pb.done = modulemd_defaults_total for default in modulemd_default_names: artifact = default.pop('artifact') relative_path = '{}{}snippet'.format( default[PULP_MODULEDEFAULTS_ATTR.MODULE], default[PULP_MODULEDEFAULTS_ATTR.STREAM]) da = DeclarativeArtifact(artifact=artifact, relative_path=relative_path, url=modules_url) default_content = ModulemdDefaults(**default) dc = DeclarativeContent(content=default_content, d_artifacts=[da]) await self.put(dc) # delete list now that we're done with it for memory savings del modulemd_default_names if comps_downloader: comps_result = await comps_downloader.run() comps = libcomps.Comps() comps.fromxml_f(comps_result.path) with ProgressReport(message='Parsed Comps', code='parsing.comps') as comps_pb: comps_total = (len(comps.groups) + len(comps.categories) + len(comps.environments)) comps_pb.total = comps_total comps_pb.done = comps_total if comps.langpacks: langpack_dict = PackageLangpacks.libcomps_to_dict( comps.langpacks) packagelangpack = PackageLangpacks( matches=strdict_to_dict(comps.langpacks), digest=dict_digest(langpack_dict)) dc = DeclarativeContent(content=packagelangpack) dc.extra_data = defaultdict(list) await self.put(dc) if comps.categories: for category in comps.categories: category_dict = PackageCategory.libcomps_to_dict( category) category_dict['digest'] = dict_digest(category_dict) packagecategory = PackageCategory(**category_dict) dc = DeclarativeContent(content=packagecategory) dc.extra_data = defaultdict(list) if packagecategory.group_ids: for group_id in packagecategory.group_ids: group_to_categories[group_id['name']].append( dc) dc_categories.append(dc) if comps.environments: for environment in comps.environments: environment_dict = PackageEnvironment.libcomps_to_dict( environment) environment_dict['digest'] = dict_digest( environment_dict) packageenvironment = PackageEnvironment( **environment_dict) dc = DeclarativeContent(content=packageenvironment) dc.extra_data = defaultdict(list) if packageenvironment.option_ids: for option_id in packageenvironment.option_ids: optionalgroup_to_environments[ option_id['name']].append(dc) if packageenvironment.group_ids: for group_id in packageenvironment.group_ids: group_to_environments[group_id['name']].append( dc) dc_environments.append(dc) if comps.groups: for group in comps.groups: group_dict = PackageGroup.libcomps_to_dict(group) group_dict['digest'] = dict_digest(group_dict) packagegroup = PackageGroup(**group_dict) dc = DeclarativeContent(content=packagegroup) dc.extra_data = defaultdict(list) if packagegroup.packages: for package in packagegroup.packages: pkgname_to_groups[package['name']].append(dc) if dc.content.id in group_to_categories.keys(): for dc_category in group_to_categories[ dc.content.id]: dc.extra_data['category_relations'].append( dc_category) dc_category.extra_data['packagegroups'].append( dc) if dc.content.id in group_to_environments.keys(): for dc_environment in group_to_environments[ dc.content.id]: dc.extra_data['environment_relations'].append( dc_environment) dc_environment.extra_data[ 'packagegroups'].append(dc) if dc.content.id in optionalgroup_to_environments.keys( ): for dc_environment in optionalgroup_to_environments[ dc.content.id]: 
dc.extra_data['env_relations_optional'].append( dc_environment) dc_environment.extra_data[ 'optionalgroups'].append(dc) dc_groups.append(dc) for dc_category in dc_categories: await self.put(dc_category) for dc_environment in dc_environments: await self.put(dc_environment) # delete lists now that we're done with them for memory savings del dc_environments del dc_categories # to preserve order, downloaders are created after all repodata urls are identified package_repodata_downloaders = [] for repodata_type in PACKAGE_REPODATA: downloader = self.remote.get_downloader( url=package_repodata_urls[repodata_type]) package_repodata_downloaders.append(downloader.run()) downloaders.append(package_repodata_downloaders) # asyncio.gather is used to preserve the order of results for package repodata pending = [ asyncio.gather(*downloaders_group) for downloaders_group in downloaders ] while pending: done, pending = await asyncio.wait( pending, return_when=asyncio.FIRST_COMPLETED) for downloader in done: try: results = downloader.result() except ClientResponseError as exc: raise HTTPNotFound( reason=_("File not found: {filename}").format( filename=exc.request_info.url)) if results[0].url == package_repodata_urls['primary']: primary_xml_path = results[0].path filelists_xml_path = results[1].path other_xml_path = results[2].path metadata_pb.done += 3 metadata_pb.save() packages = await RpmFirstStage.parse_repodata( primary_xml_path, filelists_xml_path, other_xml_path) # skip SRPM if defined if 'srpm' in self.skip_types: packages = { pkgId: pkg for pkgId, pkg in packages.items() if pkg.arch != 'src' } progress_data = { 'message': 'Parsed Packages', 'code': 'parsing.packages', 'total': len(packages), } with ProgressReport(**progress_data) as packages_pb: for pkg in packages.values(): package = Package( **Package.createrepo_to_dict(pkg)) artifact = Artifact(size=package.size_package) checksum_type = getattr( CHECKSUM_TYPES, package.checksum_type.upper()) setattr(artifact, checksum_type, package.pkgId) url = urljoin(remote_url, package.location_href) filename = os.path.basename( package.location_href) da = DeclarativeArtifact( artifact=artifact, url=url, relative_path=filename, remote=self.remote, deferred_download=self.deferred_download) dc = DeclarativeContent(content=package, d_artifacts=[da]) dc.extra_data = defaultdict(list) # find if a package relates to a modulemd if dc.content.nevra in nevra_to_module.keys(): dc.content.is_modular = True for dc_modulemd in nevra_to_module[ dc.content.nevra]: dc.extra_data[ 'modulemd_relation'].append( dc_modulemd) dc_modulemd.extra_data[ 'package_relation'].append(dc) if dc.content.name in pkgname_to_groups.keys(): for dc_group in pkgname_to_groups[ dc.content.name]: dc.extra_data[ 'group_relations'].append(dc_group) dc_group.extra_data[ 'related_packages'].append(dc) packages_pb.increment() await self.put(dc) elif results[0].url == updateinfo_url: updateinfo_xml_path = results[0].path metadata_pb.increment() updates = await RpmFirstStage.parse_updateinfo( updateinfo_xml_path) progress_data = { 'message': 'Parsed Advisories', 'code': 'parsing.advisories', 'total': len(updates), } with ProgressReport(**progress_data) as advisories_pb: for update in updates: update_record = UpdateRecord( **UpdateRecord.createrepo_to_dict(update)) update_record.digest = hash_update_record( update) future_relations = { 'collections': defaultdict(list), 'references': [] } for collection in update.collections: coll_dict = UpdateCollection.createrepo_to_dict( collection) coll = 
UpdateCollection(**coll_dict) for package in collection.packages: pkg_dict = UpdateCollectionPackage.createrepo_to_dict( package) pkg = UpdateCollectionPackage( **pkg_dict) future_relations['collections'][ coll].append(pkg) for reference in update.references: reference_dict = UpdateReference.createrepo_to_dict( reference) ref = UpdateReference(**reference_dict) future_relations['references'].append(ref) advisories_pb.increment() dc = DeclarativeContent(content=update_record) dc.extra_data = future_relations await self.put(dc) # now send modules down the pipeline since all relations have been set up for modulemd in modulemd_list: await self.put(modulemd) for dc_group in dc_groups: await self.put(dc_group)
def create_repomd_xml( content, publication, checksum_types, extra_repomdrecords, sub_folder=None, metadata_signing_service=None, ): """ Creates a repomd.xml file. Args: content(app.models.Content): content set publication(pulpcore.plugin.models.Publication): the publication extra_repomdrecords(list): list with data relative to repo metadata files sub_folder(str): name of the folder for sub repos metadata_signing_service (pulpcore.app.models.AsciiArmoredDetachedSigningService): A reference to an associated signing service. """ cwd = os.getcwd() repodata_path = REPODATA_PATH has_modules = False has_comps = False package_checksum_type = checksum_types.get("package") if sub_folder: cwd = os.path.join(cwd, sub_folder) repodata_path = os.path.join(sub_folder, repodata_path) # Prepare metadata files repomd_path = os.path.join(cwd, "repomd.xml") pri_xml_path = os.path.join(cwd, "primary.xml.gz") fil_xml_path = os.path.join(cwd, "filelists.xml.gz") oth_xml_path = os.path.join(cwd, "other.xml.gz") upd_xml_path = os.path.join(cwd, "updateinfo.xml.gz") mod_yml_path = os.path.join(cwd, "modules.yaml") comps_xml_path = os.path.join(cwd, "comps.xml") pri_xml = cr.PrimaryXmlFile(pri_xml_path) fil_xml = cr.FilelistsXmlFile(fil_xml_path) oth_xml = cr.OtherXmlFile(oth_xml_path) upd_xml = cr.UpdateInfoXmlFile(upd_xml_path) if publication.sqlite_metadata: pri_db_path = os.path.join(cwd, "primary.sqlite") fil_db_path = os.path.join(cwd, "filelists.sqlite") oth_db_path = os.path.join(cwd, "other.sqlite") pri_db = cr.PrimarySqlite(pri_db_path) fil_db = cr.FilelistsSqlite(fil_db_path) oth_db = cr.OtherSqlite(oth_db_path) packages = Package.objects.filter(pk__in=content) total_packages = packages.count() pri_xml.set_num_of_pkgs(total_packages) fil_xml.set_num_of_pkgs(total_packages) oth_xml.set_num_of_pkgs(total_packages) # We want to support publishing with a different checksum type than the one built-in to the # package itself, so we need to get the correct checksums somehow if there is an override. # We must also take into consideration that if the package has not been downloaded the only # checksum that is available is the one built-in. # # Since this lookup goes from Package->Content->ContentArtifact->Artifact, performance is a # challenge. We use ContentArtifact as our starting point because it enables us to work with # simple foreign keys and avoid messing with the many-to-many relationship, which doesn't # work with select_related() and performs poorly with prefetch_related(). This is fine # because we know that Packages should only ever have one artifact per content. contentartifact_qs = ( ContentArtifact.objects.filter(content__in=packages.only("pk")). select_related( # content__rpm_package is a bit of a hack, exploiting the way django sets up model # inheritance, but it works and is unlikely to break. All content artifacts being # accessed here have an associated Package since they originally came from the # Package queryset. 
"artifact", "content__rpm_package", ).only("artifact", "content__rpm_package__checksum_type", "content__rpm_package__pkgId")) pkg_to_hash = {} for ca in contentartifact_qs.iterator(): pkgid = None if package_checksum_type: package_checksum_type = package_checksum_type.lower() pkgid = getattr(ca.artifact, package_checksum_type, None) if pkgid: pkg_to_hash[ca.content_id] = (package_checksum_type, pkgid) else: pkg_to_hash[ca.content_id] = ( ca.content.rpm_package.checksum_type, ca.content.rpm_package.pkgId, ) # Process all packages for package in packages.iterator(): pkg = package.to_createrepo_c() # rewrite the checksum and checksum type with the desired ones (checksum, pkgId) = pkg_to_hash[package.pk] pkg.checksum_type = checksum pkg.pkgId = pkgId pkg_filename = os.path.basename(package.location_href) # this can cause an issue when two same RPM package names appears # a/name1.rpm b/name1.rpm pkg.location_href = os.path.join(PACKAGES_DIRECTORY, pkg_filename[0].lower(), pkg_filename) pri_xml.add_pkg(pkg) fil_xml.add_pkg(pkg) oth_xml.add_pkg(pkg) if publication.sqlite_metadata: pri_db.add_pkg(pkg) fil_db.add_pkg(pkg) oth_db.add_pkg(pkg) # Process update records for update_record in UpdateRecord.objects.filter( pk__in=content).iterator(): upd_xml.add_chunk( cr.xml_dump_updaterecord(update_record.to_createrepo_c())) # Process modulemd and modulemd_defaults with open(mod_yml_path, "ab") as mod_yml: for modulemd in Modulemd.objects.filter(pk__in=content).iterator(): mod_yml.write(modulemd._artifacts.get().file.read()) has_modules = True for default in ModulemdDefaults.objects.filter( pk__in=content).iterator(): mod_yml.write(default._artifacts.get().file.read()) has_modules = True # Process comps comps = libcomps.Comps() for pkg_grp in PackageGroup.objects.filter(pk__in=content).iterator(): group = pkg_grp.pkg_grp_to_libcomps() comps.groups.append(group) has_comps = True for pkg_cat in PackageCategory.objects.filter(pk__in=content).iterator(): cat = pkg_cat.pkg_cat_to_libcomps() comps.categories.append(cat) has_comps = True for pkg_env in PackageEnvironment.objects.filter( pk__in=content).iterator(): env = pkg_env.pkg_env_to_libcomps() comps.environments.append(env) has_comps = True for pkg_lng in PackageLangpacks.objects.filter(pk__in=content).iterator(): comps.langpacks = dict_to_strdict(pkg_lng.matches) has_comps = True comps.toxml_f( comps_xml_path, xml_options={ "default_explicit": True, "empty_groups": True, "uservisible_explicit": True }, ) pri_xml.close() fil_xml.close() oth_xml.close() upd_xml.close() repomd = cr.Repomd() if publication.sqlite_metadata: repomdrecords = [ ("primary", pri_xml_path, pri_db), ("filelists", fil_xml_path, fil_db), ("other", oth_xml_path, oth_db), ("primary_db", pri_db_path, None), ("filelists_db", fil_db_path, None), ("other_db", oth_db_path, None), ("updateinfo", upd_xml_path, None), ] else: repomdrecords = [ ("primary", pri_xml_path, None), ("filelists", fil_xml_path, None), ("other", oth_xml_path, None), ("updateinfo", upd_xml_path, None), ] if has_modules: repomdrecords.append(("modules", mod_yml_path, None)) if has_comps: repomdrecords.append(("group", comps_xml_path, None)) repomdrecords.extend(extra_repomdrecords) sqlite_files = ("primary_db", "filelists_db", "other_db") for name, path, db_to_update in repomdrecords: record = cr.RepomdRecord(name, path) checksum_type = get_checksum_type(name, checksum_types) if name in sqlite_files: record_bz = record.compress_and_fill(checksum_type, cr.BZ2) record_bz.type = name record_bz.rename_file() path = 
record_bz.location_href.split("/")[-1] repomd.set_record(record_bz) else: record.fill(checksum_type) if db_to_update: db_to_update.dbinfo_update(record.checksum) db_to_update.close() record.rename_file() path = record.location_href.split("/")[-1] repomd.set_record(record) if sub_folder: path = os.path.join(sub_folder, path) PublishedMetadata.create_from_file( relative_path=os.path.join(repodata_path, os.path.basename(path)), publication=publication, file=File(open(path, "rb")), ) with open(repomd_path, "w") as repomd_f: repomd_f.write(repomd.xml_dump()) if metadata_signing_service: signing_service = AsciiArmoredDetachedSigningService.objects.get( pk=metadata_signing_service.pk) sign_results = signing_service.sign(repomd_path) # publish a signed file PublishedMetadata.create_from_file( relative_path=os.path.join(repodata_path, os.path.basename(sign_results["file"])), publication=publication, file=File(open(sign_results["file"], "rb")), ) # publish a detached signature PublishedMetadata.create_from_file( relative_path=os.path.join( repodata_path, os.path.basename(sign_results["signature"])), publication=publication, file=File(open(sign_results["signature"], "rb")), ) # publish a public key required for further verification PublishedMetadata.create_from_file( relative_path=os.path.join(repodata_path, os.path.basename(sign_results["key"])), publication=publication, file=File(open(sign_results["key"], "rb")), ) else: PublishedMetadata.create_from_file( relative_path=os.path.join(repodata_path, os.path.basename(repomd_path)), publication=publication, file=File(open(repomd_path, "rb")), )
def create_repomd_xml(content, publication, extra_repomdrecords, sub_folder=None): """ Creates a repomd.xml file. Args: content(app.models.Content): content set publication(pulpcore.plugin.models.Publication): the publication extra_repomdrecords(list): list with data relative to repo metadata files sub_folder(str): name of the folder for sub repos """ cwd = os.getcwd() repodata_path = REPODATA_PATH has_modules = False has_comps = False if sub_folder: cwd = os.path.join(cwd, sub_folder) repodata_path = os.path.join(sub_folder, repodata_path) # Prepare metadata files repomd_path = os.path.join(cwd, "repomd.xml") pri_xml_path = os.path.join(cwd, "primary.xml.gz") fil_xml_path = os.path.join(cwd, "filelists.xml.gz") oth_xml_path = os.path.join(cwd, "other.xml.gz") pri_db_path = os.path.join(cwd, "primary.sqlite") fil_db_path = os.path.join(cwd, "filelists.sqlite") oth_db_path = os.path.join(cwd, "other.sqlite") upd_xml_path = os.path.join(cwd, "updateinfo.xml.gz") mod_yml_path = os.path.join(cwd, "modules.yaml") comps_xml_path = os.path.join(cwd, "comps.xml") pri_xml = cr.PrimaryXmlFile(pri_xml_path) fil_xml = cr.FilelistsXmlFile(fil_xml_path) oth_xml = cr.OtherXmlFile(oth_xml_path) pri_db = cr.PrimarySqlite(pri_db_path) fil_db = cr.FilelistsSqlite(fil_db_path) oth_db = cr.OtherSqlite(oth_db_path) upd_xml = cr.UpdateInfoXmlFile(upd_xml_path) packages = Package.objects.filter(pk__in=content) total_packages = packages.count() pri_xml.set_num_of_pkgs(total_packages) fil_xml.set_num_of_pkgs(total_packages) oth_xml.set_num_of_pkgs(total_packages) # Process all packages for package in packages.iterator(): pkg = package.to_createrepo_c() pkg.location_href = package.contentartifact_set.only('relative_path').first().relative_path pri_xml.add_pkg(pkg) fil_xml.add_pkg(pkg) oth_xml.add_pkg(pkg) pri_db.add_pkg(pkg) fil_db.add_pkg(pkg) oth_db.add_pkg(pkg) # Process update records for update_record in UpdateRecord.objects.filter(pk__in=content).iterator(): upd_xml.add_chunk(cr.xml_dump_updaterecord(update_record.to_createrepo_c())) # Process modulemd and modulemd_defaults with open(mod_yml_path, 'ab') as mod_yml: for modulemd in Modulemd.objects.filter(pk__in=content).iterator(): mod_yml.write(modulemd._artifacts.get().file.read()) has_modules = True for default in ModulemdDefaults.objects.filter(pk__in=content).iterator(): mod_yml.write(default._artifacts.get().file.read()) has_modules = True # Process comps comps = libcomps.Comps() for pkg_grp in PackageGroup.objects.filter(pk__in=content).iterator(): group = pkg_grp.pkg_grp_to_libcomps() comps.groups.append(group) has_comps = True for pkg_cat in PackageCategory.objects.filter(pk__in=content).iterator(): cat = pkg_cat.pkg_cat_to_libcomps() comps.categories.append(cat) has_comps = True for pkg_env in PackageEnvironment.objects.filter(pk__in=content).iterator(): env = pkg_env.pkg_env_to_libcomps() comps.environments.append(env) has_comps = True for pkg_lng in PackageLangpacks.objects.filter(pk__in=content).iterator(): comps.langpacks = dict_to_strdict(pkg_lng.matches) has_comps = True comps.toxml_f(comps_xml_path, xml_options={"default_explicit": True, "empty_groups": True, "uservisible_explicit": True}) pri_xml.close() fil_xml.close() oth_xml.close() upd_xml.close() repomd = cr.Repomd() repomdrecords = [("primary", pri_xml_path, pri_db), ("filelists", fil_xml_path, fil_db), ("other", oth_xml_path, oth_db), ("primary_db", pri_db_path, None), ("filelists_db", fil_db_path, None), ("other_db", oth_db_path, None), ("updateinfo", upd_xml_path, None)] if 
has_modules: repomdrecords.append(("modules", mod_yml_path, None)) if has_comps: repomdrecords.append(("group", comps_xml_path, None)) repomdrecords.extend(extra_repomdrecords) sqlite_files = ("primary_db", "filelists_db", "other_db") for name, path, db_to_update in repomdrecords: record = cr.RepomdRecord(name, path) if name in sqlite_files: record_bz = record.compress_and_fill(cr.SHA256, cr.BZ2) record_bz.type = name record_bz.rename_file() path = record_bz.location_href.split('/')[-1] repomd.set_record(record_bz) else: record.fill(cr.SHA256) if (db_to_update): db_to_update.dbinfo_update(record.checksum) db_to_update.close() record.rename_file() path = record.location_href.split('/')[-1] repomd.set_record(record) if sub_folder: path = os.path.join(sub_folder, path) PublishedMetadata.create_from_file( relative_path=os.path.join(repodata_path, os.path.basename(path)), publication=publication, file=File(open(path, 'rb')) ) with open(repomd_path, "w") as repomd_f: repomd_f.write(repomd.xml_dump()) PublishedMetadata.create_from_file( relative_path=os.path.join(repodata_path, os.path.basename(repomd_path)), publication=publication, file=File(open(repomd_path, 'rb')) )
def cr_create_md(repodata_path, pkglist=None, log=sys.stdout): if pkglist is None: pkglist = cr_get_pkg_list(repo_base, log) pri_xml_path = os.path.join(repodata_path, 'primary.xml.gz') fil_xml_path = os.path.join(repodata_path, 'filelists.xml.gz') oth_xml_path = os.path.join(repodata_path, 'other.xml.gz') pri_db_path = os.path.join(repodata_path, 'primary.sqlite') fil_db_path = os.path.join(repodata_path, 'filelists.sqlite') oth_db_path = os.path.join(repodata_path, 'other.sqlite') def __create_xml(queues, xml_path, xml_func, name): cs = cr.ContentStat(cr.SHA256) xml = xml_func(xml_path, contentstat=cs) xml.set_num_of_pkgs(len(pkglist)) for pkg in pkglist: xml.add_pkg(pkg) xml.close() queues['master'].put( ((name, xml_path), (cs.checksum, cs.size, cs.checksum_type)), True) def __create_db(queues, db_path, db_func, name): db = db_func(db_path) for pkg in pkglist: db.add_pkg(pkg) db.dbinfo_update(queues[name].get(True)) db.close() cs = cr.ContentStat(cr.SHA256) cr.compress_file_with_stat( db_path, db_path + cr.compression_suffix(cr.BZ2_COMPRESSION), cr.BZ2_COMPRESSION, cs) os.remove(db_path) queues['master'].put( ((name + '_db', db_path + cr.compression_suffix(cr.BZ2_COMPRESSION)), (cs.checksum, cs.size, cs.checksum_type)), True) queue_manager = multiprocessing.Manager() queues = dict({ 'master': queue_manager.Queue(), 'primary': queue_manager.Queue(), 'filelists': queue_manager.Queue(), 'other': queue_manager.Queue(), }) log.write('[%s] Generating metadata in %s\n' % (stamp(), repodata_path)) th = [0] * 6 th[0] = multiprocessing.Process(target=__create_xml, args=(queues, pri_xml_path, cr.PrimaryXmlFile, 'primary')) th[0].start() th[1] = multiprocessing.Process(target=__create_xml, args=(queues, fil_xml_path, cr.FilelistsXmlFile, 'filelists')) th[1].start() th[2] = multiprocessing.Process(target=__create_xml, args=(queues, oth_xml_path, cr.OtherXmlFile, 'other')) th[2].start() th[3] = multiprocessing.Process(target=__create_db, args=(queues, pri_db_path, cr.PrimarySqlite, 'primary')) th[3].start() th[4] = multiprocessing.Process(target=__create_db, args=(queues, fil_db_path, cr.FilelistsSqlite, 'filelists')) th[4].start() th[5] = multiprocessing.Process(target=__create_db, args=(queues, oth_db_path, cr.OtherSqlite, 'other')) th[5].start() repomd = cr.Repomd() data_files = set() for i in range(0, 6): rf = queues['master'].get(True) r = cr.RepomdRecord(*rf[0]) r.checksum_open_type = cr.checksum_name_str(rf[1][2]) r.checksum_open = rf[1][0] r.size_open = rf[1][1] r.fill(cr.SHA256) if not rf[0][0].endswith('_db'): queues[rf[0][0]].put(r.checksum, True) r.rename_file() r.location_href = os.path.join('repodata', os.path.basename(r.location_href)) data_files.add(r.location_real) repomd.set_record(r) for t in th: t.join() repomd.sort_records() return (repomd.xml_dump(), data_files)
def create_rempomd_xml(packages, publication, extra_repomdrecords,
                       sub_folder=None):
    """
    Creates a repomd.xml file.

    Args:
        packages (app.models.Package): set of packages
        publication (pulpcore.plugin.models.Publication): the publication
        extra_repomdrecords (list): list with data relative to repo metadata files
        sub_folder (str): name of the folder for sub repos
    """
    cwd = os.getcwd()
    repodata_path = REPODATA_PATH
    has_modules = False

    if sub_folder:
        cwd = os.path.join(cwd, sub_folder)
        repodata_path = os.path.join(sub_folder, repodata_path)

    # Prepare metadata files
    repomd_path = os.path.join(cwd, "repomd.xml")
    pri_xml_path = os.path.join(cwd, "primary.xml.gz")
    fil_xml_path = os.path.join(cwd, "filelists.xml.gz")
    oth_xml_path = os.path.join(cwd, "other.xml.gz")
    pri_db_path = os.path.join(cwd, "primary.sqlite")
    fil_db_path = os.path.join(cwd, "filelists.sqlite")
    oth_db_path = os.path.join(cwd, "other.sqlite")
    upd_xml_path = os.path.join(cwd, "updateinfo.xml.gz")
    mod_yml_path = os.path.join(cwd, "modules.yaml")

    pri_xml = cr.PrimaryXmlFile(pri_xml_path)
    fil_xml = cr.FilelistsXmlFile(fil_xml_path)
    oth_xml = cr.OtherXmlFile(oth_xml_path)
    pri_db = cr.PrimarySqlite(pri_db_path)
    fil_db = cr.FilelistsSqlite(fil_db_path)
    oth_db = cr.OtherSqlite(oth_db_path)
    upd_xml = cr.UpdateInfoXmlFile(upd_xml_path)

    pri_xml.set_num_of_pkgs(len(packages))
    fil_xml.set_num_of_pkgs(len(packages))
    oth_xml.set_num_of_pkgs(len(packages))

    # Process all packages
    for package in packages:
        pkg = package.to_createrepo_c()
        pkg.location_href = package.contentartifact_set.first().relative_path
        pri_xml.add_pkg(pkg)
        fil_xml.add_pkg(pkg)
        oth_xml.add_pkg(pkg)
        pri_db.add_pkg(pkg)
        fil_db.add_pkg(pkg)
        oth_db.add_pkg(pkg)

    # Process update records
    for update_record in UpdateRecord.objects.filter(
            pk__in=publication.repository_version.content):
        upd_xml.add_chunk(update_record_xml(update_record))

    # Process modulemd and modulemd_defaults
    with open(mod_yml_path, 'ab') as mod_yml:
        for modulemd in Modulemd.objects.filter(
                pk__in=publication.repository_version.content):
            mod_yml.write(modulemd._artifacts.get().file.read())
            has_modules = True

        for default in ModulemdDefaults.objects.filter(
                pk__in=publication.repository_version.content):
            mod_yml.write(default._artifacts.get().file.read())
            has_modules = True

    pri_xml.close()
    fil_xml.close()
    oth_xml.close()
    upd_xml.close()

    repomd = cr.Repomd()

    repomdrecords = [("primary", pri_xml_path, pri_db),
                     ("filelists", fil_xml_path, fil_db),
                     ("other", oth_xml_path, oth_db),
                     ("primary_db", pri_db_path, None),
                     ("filelists_db", fil_db_path, None),
                     ("other_db", oth_db_path, None),
                     ("updateinfo", upd_xml_path, None)]

    if has_modules:
        repomdrecords.append(("modules", mod_yml_path, None))

    repomdrecords.extend(extra_repomdrecords)

    sqlite_files = ("primary_db", "filelists_db", "other_db")
    for name, path, db_to_update in repomdrecords:
        record = cr.RepomdRecord(name, path)
        if name in sqlite_files:
            record_bz = record.compress_and_fill(cr.SHA256, cr.BZ2)
            record_bz.type = name
            record_bz.rename_file()
            path = record_bz.location_href.split('/')[-1]
            repomd.set_record(record_bz)
        elif name == "modules":
            record_md = record.compress_and_fill(cr.SHA256, cr.GZ)
            record_md.type = name
            record_md.rename_file()
            path = record_md.location_href.split('/')[-1]
            repomd.set_record(record_md)
        else:
            record.fill(cr.SHA256)
            if db_to_update:
                db_to_update.dbinfo_update(record.checksum)
                db_to_update.close()
            record.rename_file()
            path = record.location_href.split('/')[-1]
            repomd.set_record(record)

        if sub_folder:
            path = os.path.join(sub_folder, path)

        PublishedMetadata.create_from_file(
            relative_path=os.path.join(repodata_path, os.path.basename(path)),
            publication=publication,
            file=File(open(path, 'rb')))

    with open(repomd_path, "w") as repomd_f:
        repomd_f.write(repomd.xml_dump())

    PublishedMetadata.create_from_file(
        relative_path=os.path.join(repodata_path, os.path.basename(repomd_path)),
        publication=publication,
        file=File(open(repomd_path, 'rb')))
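# A minimal follow-up sketch (assumption: run from the same working directory
# that create_rempomd_xml wrote into): re-read the generated repomd.xml and
# print which record types were registered and where they point.
import createrepo_c as cr

verify = cr.Repomd("repomd.xml")
print("revision:", verify.revision)
for rec in verify.records:
    print(rec.type, rec.location_href, rec.checksum_type, rec.size)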
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.
    """
    packages_pb = ProgressBar(message='Parsed Packages')
    erratum_pb = ProgressBar(message='Parsed Erratum')

    packages_pb.save()
    erratum_pb.save()

    with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(self.remote.url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(
                    self.remote.url, record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(self.remote.url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            else:
                log.info(
                    _('Unknown repodata type: {t}. Skipped.').format(
                        t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(
                url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [
            asyncio.gather(*downloaders_group)
            for downloaders_group in downloaders
        ]

        while pending:
            done, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(
                        primary_xml_path, filelists_xml_path, other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(
                            CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(self.remote.url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        packages_pb.increment()
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(
                        updateinfo_xml_path)
                    erratum_pb.total = len(updates)
                    erratum_pb.state = 'running'
                    erratum_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(
                            **UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)

                        future_relations = {
                            'collections': defaultdict(list),
                            'references': []
                        }

                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        erratum_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    packages_pb.state = 'completed'
    erratum_pb.state = 'completed'
    packages_pb.save()
    erratum_pb.save()
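# Standalone sketch of the download-ordering pattern used above (plain asyncio,
# no Pulp objects; the fetch names and delays are made up): each group of
# coroutines is wrapped in asyncio.gather so results inside a group keep their
# order, while asyncio.wait lets the groups themselves finish in any order.
import asyncio


async def fetch(name, delay):
    # Stand-in for downloader.run().
    await asyncio.sleep(delay)
    return name


async def main():
    groups = [
        [fetch('updateinfo', 0.2)],
        [fetch('primary', 0.3), fetch('filelists', 0.1), fetch('other', 0.2)],
    ]
    pending = [asyncio.gather(*group) for group in groups]
    while pending:
        done, pending = await asyncio.wait(
            pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            # Within a group the order is preserved, e.g. the second group
            # always yields ['primary', 'filelists', 'other'].
            print(task.result())


asyncio.run(main())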
def oneshot_callback():
    """Parse one file at a time into a set of packages.

    Use of this method is discouraged.

    newpkgcb
    --------
    Via newpkgcb (package callback) you can directly decide whether the
    current package element should be parsed or not. This decision can be
    based on three values that are available as attributes of the
    <package> element:

    - pkgId (package checksum)
    - name (package name)
    - arch (package architecture)

    (Note: This is applicable only for filelists.xml and other.xml;
    primary.xml doesn't contain this information in the <package> element.)

    If newpkgcb returns a package object, the parsed data will be loaded
    into this package object. If it returns None, the package element is
    skipped. This can reduce memory requirements, because unwanted packages
    can be skipped without having to be stored in memory.

    If no newpkgcb is specified, a default callback returning a new package
    object is used.

    pkgcb
    -----
    Callback called when parsing of a <package> element is done.
    Its argument is a package object that has previously been returned by
    the newpkgcb. This function should return True if parsing should
    continue or False if parsing should be interrupted.

    Note: Both callbacks are optional, BUT at least one MUST be used
    (newpkgcb or pkgcb)!

    warningcb
    ---------
    The warning callback is called when a non-fatal oddity in the parsed
    XML is detected. If True is returned, parsing continues. If the return
    value is False, parsing is terminated.
    This callback is optional.
    """
    primary_xml_path = None
    filelists_xml_path = None
    other_xml_path = None

    #
    # repomd.xml parsing
    #

    # Parse repomd.xml to get paths (1. Method - Repomd object based)
    # Pros: Easy to use
    repomd = cr.Repomd(os.path.join(REPO_PATH, "repodata/repomd.xml"))

    # Parse repomd.xml (2. Method - Parser based)
    # Pros: Warning callback could be specified
    def warningcb(warning_type, message):
        """Optional callback for warnings about weird stuff and formatting
        in XML.

        :param warning_type: Integer value. One of the XML_WARNING_* constants.
        :param message: String message.
        """
        print("PARSER WARNING: %s" % message)
        return True

    repomd2 = cr.Repomd()
    cr.xml_parse_repomd(
        os.path.join(REPO_PATH, "repodata/repomd.xml"), repomd2, warningcb)

    # Get the paths we need
    # (repomd or repomd2 could be used; both have the same values)
    for record in repomd.records:
        if record.type == "primary":
            primary_xml_path = record.location_href
        elif record.type == "filelists":
            filelists_xml_path = record.location_href
        elif record.type == "other":
            other_xml_path = record.location_href

    #
    # Main XML metadata parsing (primary, filelists, other)
    #

    packages = {}

    def pkgcb(pkg):
        # Called when a whole package entry in the XML has been parsed
        packages[pkg.pkgId] = pkg

    def newpkgcb(pkgId, name, arch):
        # Called when a new package entry is encountered and only the opening
        # <package> element has been parsed. This function has to return the
        # package to which the parsed data will be added, or None if this
        # package should be skipped.
        return packages.get(pkgId, None)

    # The do_files option tells the primary parser to skip the <file> elements
    # of each package. If you plan to parse filelists.xml after primary.xml,
    # always set do_files to False.
    cr.xml_parse_primary(os.path.join(REPO_PATH, primary_xml_path),
                         pkgcb=pkgcb,
                         do_files=False,
                         warningcb=warningcb)

    cr.xml_parse_filelists(os.path.join(REPO_PATH, filelists_xml_path),
                           newpkgcb=newpkgcb,
                           warningcb=warningcb)

    cr.xml_parse_other(os.path.join(REPO_PATH, other_xml_path),
                       newpkgcb=newpkgcb,
                       warningcb=warningcb)

    for pkg in packages.values():
        print_package_info(pkg)
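# A small variation on the callbacks above (a sketch, not from the original
# example): use newpkgcb on filelists.xml to keep only noarch packages.
# REPO_PATH and filelists_xml_path are assumed to be set up as in
# oneshot_callback.
wanted = {}


def noarch_only_newpkgcb(pkgId, name, arch):
    # Returning None skips the package entirely; returning a fresh cr.Package
    # tells the parser where to store the parsed data.
    if arch != "noarch":
        return None
    pkg = cr.Package()
    wanted[pkgId] = pkg
    return pkg


cr.xml_parse_filelists(os.path.join(REPO_PATH, filelists_xml_path),
                       newpkgcb=noarch_only_newpkgcb)

print("noarch packages:", len(wanted))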