@contextmanager
def _prepare_versionlock_lists(
    subvol: Subvol, snapshot_dir: Path, list_path: Path
) -> Dict[str, Tuple[str, int]]:
    '''
    Yields a map of "in-snapshot path" -> "tempfile with its contents",
    with the intention that the tempfile in the value will be a read-only
    bind-mount over the path in the key.
    '''
    # `dnf` and `yum` expect different formats, so we parse our own.
    with open(list_path) as rf:
        envras = [l.split('\t') for l in rf]
    templates = {'yum': '{e}:{n}-{v}-{r}.{a}', 'dnf': '{n}-{e}:{v}-{r}.{a}'}
    dest_to_src_and_size = {}
    with temp_dir() as d:
        # Only bind-mount lists for those binaries that exist in the snapshot.
        for prog in set(
            f'{p}' for p in (subvol.path(snapshot_dir)).listdir()
        ) & set(templates.keys()):
            template = templates[prog]
            src = d / (prog + '-versionlock.list')
            with create_ro(src, 'w') as wf:
                for e, n, v, r, a in envras:
                    wf.write(template.format(e=e, n=n, v=v, r=r, a=a))
            set_new_key(
                dest_to_src_and_size,
                # This path convention must match how `write_yum_dnf_conf.py`
                # and `rpm_repo_snapshot.bzl` set up their output.
                snapshot_dir / f'{prog}/etc/{prog}/plugins/versionlock.list',
                (src, len(envras)),
            )
        yield dest_to_src_and_size
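
# Illustrative only: the two templates above produce differently ordered
# NEVRA strings for the same (epoch, name, version, release, arch) tuple.
# The sample values below are hypothetical, not from any real snapshot.
_TEMPLATES = {'yum': '{e}:{n}-{v}-{r}.{a}', 'dnf': '{n}-{e}:{v}-{r}.{a}'}
_E, _N, _V, _R, _A = '0', 'bash', '4.4.23', '1.fc29', 'x86_64'
assert _TEMPLATES['yum'].format(e=_E, n=_N, v=_V, r=_R, a=_A) == (
    '0:bash-4.4.23-1.fc29.x86_64')
assert _TEMPLATES['dnf'].format(e=_E, n=_N, v=_V, r=_R, a=_A) == (
    'bash-0:4.4.23-1.fc29.x86_64')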
def _normalize_snapshot_to_versionlock(
    snapshot_to_vl: Iterable[Tuple[Path, Path]],
    snapshots: Iterable[Path],
) -> Mapping[Path, Path]:
    snapshots = frozenset(snapshots)
    s_to_vl = {}
    for s, vl in snapshot_to_vl:
        assert s in snapshots, (s, snapshots)
        set_new_key(s_to_vl, s, vl)
    return MappingProxyType(s_to_vl)
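
# `set_new_key` is used throughout these functions to build dicts whose keys
# must be unique (storage IDs, bind-mount destinations, snapshot paths).  A
# minimal sketch of the semantics the call sites appear to rely on -- insert
# a key that must not already be present -- is below; the real helper may
# differ in its exact error type and message.
def _example_set_new_key(d: dict, k, v) -> None:
    assert k not in d, f'{k} is already set in {d}'
    d[k] = v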
def _memoize_error(self, obj, error: ReportableError):
    '''
    Any size or checksum errors we see are likely to be permanent, so we
    MUTATE `obj` with the error, hiding the old `storage_id` inside.
    '''
    error_dict = {
        **error.to_dict(),
        # Since `storage_id` is hidden, `send_head` will show the error.
        'storage_id': obj.pop('storage_id'),
    }
    set_new_key(obj, 'error', error_dict)
def _download_rpms(self, rpms: Iterable[Rpm], shard: RpmShard):
    log.info(f'''`{self._repo_name}` has {len(rpms)} RPMs weighing {
        sum(r.size for r in rpms):,} bytes''')
    storage_id_to_rpm = {}
    # Download in random order to reduce collisions from racing writers.
    for rpm in shuffled(rpms):
        if not shard.in_shard(rpm):
            continue
        with self._repo_db_ctx as db:
            # If we get no `storage_id` back, there are 3 possibilities:
            #  - `rpm.filename` was never seen before.
            #  - `rpm.filename` was seen before, but it was hashed with
            #    different algorithm(s), so we MUST download and compute
            #    the canonical checksum to know if its contents are the
            #    same.
            #  - `rpm.filename` was seen before, **AND** one of the prior
            #    checksums used `rpm.checksum.algorithms`, but produced a
            #    different hash value.  In other words, this is a
            #    `MutableRpmError`, because the same filename must have had
            #    two different contents.  We COULD explicitly detect this
            #    error here, and avoid the download.  However, this severe
            #    error should be infrequent, and we actually get valuable
            #    information from the download -- this lets us know whether
            #    the file is wrong or the repodata is wrong.
            storage_id, canonical_checksum = \
                db.get_rpm_storage_id_and_checksum(self._rpm_table, rpm)
        # If the RPM is already stored with a matching checksum, just
        # update its `.canonical_checksum`.
        if storage_id:
            rpm = rpm._replace(canonical_checksum=canonical_checksum)
        else:  # We have to download the RPM.
            try:
                with _reportable_http_errors(rpm.location):
                    storage_id, rpm = self._download_rpm(rpm)
            # IMPORTANT: All the classes of errors that we handle below
            # have the property that we would not have stored anything
            # new in the DB, meaning that such failed RPMs will be
            # retried on the next snapshot attempt.
            except ReportableError as ex:
                # RPM checksum validation errors, scenarios where the
                # same RPM name occurs with different checksums, etc.
                storage_id = ex
        # Detect if this RPM filename occurs with different contents.
        if not isinstance(storage_id, ReportableError):
            storage_id = self._detect_mutable_rpms(rpm, storage_id)
        set_new_key(storage_id_to_rpm, storage_id, rpm)
    assert len(storage_id_to_rpm) == sum(shard.in_shard(r) for r in rpms)
    return storage_id_to_rpm
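
# `shuffled` is not defined in this excerpt; the call sites only need a
# shuffled copy of the input, leaving the original iterable untouched.  A
# minimal sketch under that assumption:
import random
from typing import Iterable, List, TypeVar

_T = TypeVar('_T')

def _example_shuffled(items: Iterable[_T]) -> List[_T]:
    items = list(items)
    random.shuffle(items)
    return items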
def _download_repodatas(
    repo: YumDnfConfRepo,
    repomd: RepoMetadata,
    cfg: DownloadConfig,
) -> Tuple[Set[Rpm], Mapping[str, Repodata]]:
    rpms = None  # We'll extract these from the primary repodata
    storage_id_to_repodata = {}  # Newly stored **and** pre-existing
    repodata_table = RepodataTable()
    primary_repodata = pick_primary_repodata(repomd.repodatas)
    log_size(
        f"`{repo.name}` repodata weighs",
        sum(rd.size for rd in repomd.repodatas),
    )
    rw_db_conn = cfg.new_db_conn(readonly=False)
    with ThreadPoolExecutor(max_workers=cfg.threads) as executor:
        futures = [
            executor.submit(
                _download_repodata,
                repodata,
                repo_url=repo.base_url,
                repodata_table=repodata_table,
                cfg=cfg,
                is_primary=repodata is primary_repodata,
            )
            for repodata in shuffled(repomd.repodatas)
        ]
        for future in as_completed(futures):
            res = future.result()
            if res.newly_stored:
                # We don't want to store errors in the repo DB -- and this
                # should never happen, since `newly_stored` is only True
                # when we successfully commit a new repodata to storage.
                assert not isinstance(res.storage_id, ReportableError)
                # This repodata was newly downloaded and stored, so we
                # write its storage_id to the repo DB regardless of whether
                # we encounter fatal errors later on that fail the
                # snapshot; see the docblock in `repo_downloader.py` for
                # the reasoning.
                storage_id = maybe_write_id(
                    res.repodata, res.storage_id, repodata_table, rw_db_conn,
                )
            else:
                storage_id = res.storage_id
            if res.maybe_rpms is not None:
                # Only the primary repodata returns RPMs, so we should
                # enter this block at most once.
                assert rpms is None
                # Convert to a set to work around buggy repodatas, which
                # list the same RPM object twice.
                rpms = frozenset(res.maybe_rpms)
            set_new_key(storage_id_to_repodata, storage_id, res.repodata)
    # It's possible that we received errors while downloading non-primary
    # repodatas -- in that case we store the error in the SQLite DB, so the
    # dict should still contain an entry for every single repodata.
    assert len(storage_id_to_repodata) == len(repomd.repodatas)
    if not rpms:
        log.warning(f"Repo {repo} has no RPMs")
    return rpms, storage_id_to_repodata
def _download_repodatas(
    self,
    repomd: RepoMetadata,
    # We mutate this dictionary on-commit to allow the caller to clean
    # up any stored repodata blobs if the download fails part-way.
    persist_storage_id_to_repodata: Mapping[str, Repodata],
    visitors: Iterable['RepoObjectVisitor'],
) -> Tuple[Set[Rpm], Mapping[str, Repodata]]:
    rpms = None  # We'll extract these from the primary repodata
    storage_id_to_repodata = {}  # Newly stored **and** pre-existing
    primary_repodata = pick_primary_repodata(repomd.repodatas)
    log.info(f'''`{self._repo_name}` repodata weighs {
        sum(rd.size for rd in repomd.repodatas):,} bytes''')
    # Visitors see all declared repodata, even if some downloads fail.
    for visitor in visitors:
        for repodata in repomd.repodatas:
            visitor.visit_repodata(repodata)
    # Download in random order to reduce collisions from racing writers.
    for repodata in shuffled(repomd.repodatas):
        try:
            with _reportable_http_errors(repodata.location):
                newly_stored, storage_id, maybe_rpms = \
                    self._download_repodata(
                        repodata,
                        is_primary=repodata is primary_repodata,
                    )
            if newly_stored:
                set_new_key(
                    persist_storage_id_to_repodata, storage_id, repodata,
                )
            if maybe_rpms is not None:
                # Convert to a set to work around buggy repodatas, which
                # list the same RPM object twice.
                rpms = set(maybe_rpms)
        except ReportableError as ex:
            # We cannot proceed without the primary file -- raise here
            # to trigger the "top-level retry" in the snapshot driver.
            if repodata is primary_repodata:
                raise
            # This fake "storage ID" is not written to
            # `persist_storage_id_to_repodata`, so we will never attempt
            # to write it to the DB.  However, it does end up in
            # `repodata.json`, so the error is visible.
            storage_id = ex
        set_new_key(storage_id_to_repodata, storage_id, repodata)
    assert len(storage_id_to_repodata) == len(repomd.repodatas)
    assert rpms, 'Is the repo empty?'
    return rpms, storage_id_to_repodata
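
# `RepoObjectVisitor` is referenced only by name in this excerpt; the loop
# above just needs each visitor to expose a `visit_repodata` method.  A
# minimal counting visitor, purely for illustration (the real protocol
# likely defines additional visit_* hooks):
class _ExampleCountingVisitor:
    def __init__(self):
        self.repodata_count = 0

    def visit_repodata(self, repodata) -> None:
        # Count every declared repodata, whether or not its download works.
        self.repodata_count += 1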
def _join_snapshots(self, snapshots: List[RepoSnapshot]) -> RepoSnapshot:
    # `repomd` & `repodata` should be the same across all shards.
    repomd = snapshots[0].repomd
    storage_id_to_repodata = snapshots[0].storage_id_to_repodata
    storage_id_to_rpm = {}
    for snapshot in snapshots:
        self._check_repomd_equal(repomd, snapshot.repomd)
        self.assertEqual(
            storage_id_to_repodata, snapshot.storage_id_to_repodata,
        )
        for sid, rpm in snapshot.storage_id_to_rpm.items():
            set_new_key(storage_id_to_rpm, sid, rpm)
    return RepoSnapshot(
        repomd=repomd,
        storage_id_to_repodata=storage_id_to_repodata,
        storage_id_to_rpm=storage_id_to_rpm,
    )
def add_snapshot_db_objs(db):
    location_to_obj = {}
    for repo, build_timestamp, metadata_xml in db.execute('''
        SELECT "repo", "build_timestamp", "metadata_xml" FROM "repomd"
    ''').fetchall():
        set_new_key(
            location_to_obj,
            os.path.join(repo, 'repodata/repomd.xml'),
            {
                'size': len(metadata_xml),
                'build_timestamp': build_timestamp,
                'content_bytes': metadata_xml.encode(),
            },
        )
    for table in ['repodata', 'rpm']:
        for (
            repo, path, build_timestamp, checksum, error, error_json,
            size, storage_id,
        ) in db.execute(f'''
            SELECT "repo", "path", "build_timestamp", "checksum", "error",
                "error_json", "size", "storage_id"
            FROM "{table}"
        ''').fetchall():
            obj = {
                'checksum': checksum,
                'size': size,
                'build_timestamp': build_timestamp,
            }
            # `storage_id` is populated in the DB table for `mutable_rpm`
            # errors, but we don't want to serve up those files.
            if storage_id and not error and not error_json:
                obj['storage_id'] = storage_id
            elif error and error_json:
                obj['error'] = {'error': error, **json.loads(error_json)}
            else:  # pragma: no cover
                raise AssertionError(f'{storage_id} {error} {error_json}')
            set_new_key(location_to_obj, os.path.join(repo, path), obj)
    return location_to_obj
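
# One way to exercise `add_snapshot_db_objs` without a real snapshot DB is to
# build an in-memory SQLite database with just the columns that the SELECTs
# above read.  The schema and rows below are made up for illustration; the
# real snapshot DB may have more columns and constraints.
import sqlite3

def _example_add_snapshot_db_objs_usage():
    db = sqlite3.connect(':memory:')
    db.execute('CREATE TABLE "repomd" ("repo", "build_timestamp", "metadata_xml")')
    for table in ('repodata', 'rpm'):
        db.execute(f'''CREATE TABLE "{table}" ("repo", "path",
            "build_timestamp", "checksum", "error", "error_json", "size",
            "storage_id")''')
    db.execute("INSERT INTO \"repomd\" VALUES ('os', 123, '<repomd/>')")
    db.execute("""INSERT INTO "rpm" VALUES
        ('os', 'Packages/x.rpm', 123, 'sha256:...', NULL, NULL, 42, 'sid1')""")
    location_to_obj = add_snapshot_db_objs(db)
    assert location_to_obj['os/repodata/repomd.xml']['content_bytes'] == b'<repomd/>'
    assert location_to_obj['os/Packages/x.rpm']['storage_id'] == 'sid1'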
def read_snapshot_dir(path: str):
    location_to_obj = {}
    for repo in os.listdir(path):
        if repo == 'yum.conf':
            continue
        repo_path = Path(path) / repo
        for filename in ['rpm.json', 'repodata.json']:
            with open(repo_path / filename) as infile:
                for location, obj in json.load(infile).items():
                    set_new_key(
                        location_to_obj, os.path.join(repo, location), obj,
                    )
        # Re-parse and serialize the metadata to a format that ALMOST
        # matches the other blobs (imitating `RepoSnapshot.to_directory()`).
        # If useful, it would not be offensive to make such a `repomd.json`
        # be emitted by RepoSnapshot, instead of `repomd.xml`.  Caveat: JSON
        # isn't suitable for bytes, and the XML is currently bytes.
        with open(repo_path / 'repomd.xml', 'rb') as infile:
            repomd = RepoMetadata.new(xml=infile.read())
        location_to_obj[os.path.join(repo, 'repodata/repomd.xml')] = {
            'size': repomd.size,
            'build_timestamp': repomd.build_timestamp,
            'content_bytes': repomd.xml,  # Instead of `storage_id`
        }
        # Similarly, make JSON metadata for the repo's GPG keys.
        key_dir = repo_path / 'gpg_keys'
        for key_filename in os.listdir(key_dir.decode()):
            with open(key_dir / key_filename, 'rb') as infile:
                key_content = infile.read()
            location_to_obj[os.path.join(repo, key_filename)] = {
                'size': len(key_content),
                # We don't have a good timestamp for these, so set it to
                # "now".  Caching efficiency losses should be negligible :)
                'build_timestamp': int(time.time()),
                'content_bytes': key_content,  # Instead of `storage_id`
            }
    return location_to_obj
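
# Illustrative only: the on-disk layout `read_snapshot_dir` walks, inferred
# from the loop above (file names other than the fixed ones are made up):
#
#   <path>/
#     yum.conf                # skipped
#     <repo>/
#       rpm.json
#       repodata.json
#       repomd.xml
#       gpg_keys/
#         <any key files>     # served with a "now" build_timestamp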
def wrapped_popen(opts: _NspawnOpts, popen_args: PopenArgs):
    with ExitStack() as stack:
        dest_to_src = {}
        for snapshot, versionlock in snapshot_to_versionlock.items():
            for dest, (src, vl_size) in stack.enter_context(
                _prepare_versionlock_lists(
                    # Same note as in `inject_repo_servers.py` regarding
                    # the usage of the pre-snapshot subvolume.
                    opts.layer, snapshot, versionlock,
                )
            ).items():
                log.info(f'Locking {vl_size} RPM versions via {dest}')
                set_new_key(dest_to_src, dest, src)
        yield stack.enter_context(popen(
            opts._replace(
                bindmount_ro=(*opts.bindmount_ro, *(
                    (s, d) for d, s in dest_to_src.items()
                )),
            ),
            popen_args,
        ))
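
# Illustrative only: `dest_to_src` above is keyed by the in-container
# destination, while `bindmount_ro` takes (source, destination) pairs --
# hence the swap in the generator expression.  The paths below are
# hypothetical.
_dest_to_src = {
    '/snap/yum/etc/yum/plugins/versionlock.list': '/tmp/yum-versionlock.list',
}
assert [(s, d) for d, s in _dest_to_src.items()] == [
    ('/tmp/yum-versionlock.list', '/snap/yum/etc/yum/plugins/versionlock.list'),
]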
def _commit_repodata_and_cancel_cleanup(
    self,
    repomd: RepoMetadata,
    # We'll replace our IDs with those that actually ended up in the DB.
    storage_id_to_repodata: Mapping[str, Repodata],
    # Will retain only those IDs that are unused by the DB and need cleanup.
    persist_storage_id_to_repodata: Mapping[str, Repodata],
):
    with self._repo_db_ctx as repo_db:
        # We cannot touch `persist_storage_id_to_repodata` in the loop
        # because until the transaction commits, we must be ready to
        # delete all new storage IDs.  So instead, we will construct the
        # post-commit version of that dictionary (i.e. blobs we need to
        # delete even if the transaction lands) in this variable:
        unneeded_storage_id_to_repodata = {}
        for storage_id, repodata in persist_storage_id_to_repodata.items():
            assert not isinstance(storage_id, ReportableError), repodata
            db_storage_id = repo_db.maybe_store(
                self._repodata_table, repodata, storage_id,
            )
            _log_if_storage_ids_differ(repodata, storage_id, db_storage_id)
            if db_storage_id != storage_id:
                set_new_key(
                    storage_id_to_repodata,
                    db_storage_id,
                    storage_id_to_repodata.pop(storage_id),
                )
                set_new_key(
                    unneeded_storage_id_to_repodata, storage_id, repodata,
                )
        repo_db.store_repomd(self._repo_name, repomd)
        repo_db.commit()
        # The DB commit was successful, and we're about to exit the
        # `repo_db` context, which might, at worst, raise its own error.
        # Therefore, let's prevent the `finally` cleanup from deleting
        # the blobs whose IDs we just committed to the DB.
        persist_storage_id_to_repodata.clear()
        persist_storage_id_to_repodata.update(
            unneeded_storage_id_to_repodata
        )
def _reduce_equal_snapshots(
    self, repo_snapshots: List[Tuple[YumDnfConfRepo, RepoSnapshot]],
) -> RepoSnapshot:
    self.assertGreater(len(repo_snapshots), 0)
    # The repo, repomd & repodata should be the same across all shards.
    head_repo, head_snapshot = repo_snapshots[0]
    head_repomd = head_snapshot.repomd
    head_storage_id_to_repodata = head_snapshot.storage_id_to_repodata
    storage_id_to_rpm = {}
    for repo, snapshot in repo_snapshots[1:]:
        self.assertEqual(head_repo, repo)
        self._check_repomd_equal(head_repomd, snapshot.repomd)
        self.assertEqual(
            head_storage_id_to_repodata, snapshot.storage_id_to_repodata,
        )
        for sid, rpm in snapshot.storage_id_to_rpm.items():
            set_new_key(storage_id_to_rpm, sid, rpm)
    return RepoSnapshot(
        repomd=head_repomd,
        storage_id_to_repodata=head_storage_id_to_repodata,
        storage_id_to_rpm=storage_id_to_rpm,
    )
def _download_rpms(
    repo: YumDnfConfRepo,
    rpm_table: RpmTable,
    rpms: Iterable[Rpm],
    all_snapshot_universes: Set[str],
    cfg: DownloadConfig,
):
    log_size(
        f"`{repo.name}` has {len(rpms)} RPMs weighing",
        sum(r.size for r in rpms),
    )
    storage_id_to_rpm = {}
    rw_db_conn = cfg.new_db_conn(readonly=False)
    ro_db_conn = cfg.new_db_conn(readonly=True)
    with ThreadPoolExecutor(max_workers=cfg.threads) as executor:
        futures = [
            executor.submit(
                _handle_rpm,
                rpm,
                repo.base_url,
                rpm_table,
                all_snapshot_universes,
                cfg,
            )
            # Download in random order to reduce collisions from racing
            # writers.
            for rpm in shuffled(rpms)
            if cfg.rpm_shard.in_shard(rpm)
        ]
        for future in as_completed(futures):
            rpm, res_storage_id = future.result()
            if not isinstance(res_storage_id, ReportableError):
                # If it's valid, we write this storage_id to the repo DB
                # regardless of whether we encounter fatal errors later on
                # that keep us from finishing the snapshot -- see the
                # top-level docblock for the reasoning.
                res_storage_id = maybe_write_id(
                    rpm, res_storage_id, rpm_table, rw_db_conn,
                )
                # Detect if this RPM NEVRA occurs with different contents.
                res_storage_id = _detect_mutable_rpms(
                    rpm, rpm_table, res_storage_id, all_snapshot_universes,
                    ro_db_conn,
                )
            set_new_key(storage_id_to_rpm, res_storage_id, rpm)
    assert len(storage_id_to_rpm) == sum(
        cfg.rpm_shard.in_shard(r) for r in rpms
    )
    return storage_id_to_rpm
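
# `RpmShard.in_shard` is not shown in this excerpt.  The assertion above only
# requires it to be a deterministic predicate that partitions the RPMs across
# shards.  A hash-based sketch under that assumption (the real key, hash, and
# fields are guesses; `rpm.nevra()` is a hypothetical accessor):
import hashlib
from typing import NamedTuple

class _ExampleRpmShard(NamedTuple):
    shard: int
    modulo: int

    def in_shard(self, rpm) -> bool:
        digest = hashlib.sha256(rpm.nevra().encode()).hexdigest()
        return int(digest, 16) % self.modulo == self.shard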