@contextmanager
def _prepare_versionlock_lists(
    subvol: Subvol, snapshot_dir: Path, list_path: Path
) -> Iterator[Dict[Path, Tuple[Path, int]]]:
    '''
    Yields a map of "in-snapshot path" -> (versionlock tempfile, number of
    locked versions), with the intention that the tempfile in the value will
    be a read-only bind-mount over the path in the key.
    '''
    # `dnf` and `yum` expect different formats, so we parse our own.
    with open(list_path) as rf:
        envras = [l.split('\t') for l in rf]
    templates = {'yum': '{e}:{n}-{v}-{r}.{a}', 'dnf': '{n}-{e}:{v}-{r}.{a}'}
    dest_to_src_and_size = {}
    with temp_dir() as d:
        # Only bind-mount lists for those binaries that exist in the snapshot.
        for prog in set(
            f'{p}' for p in (subvol.path(snapshot_dir)).listdir()
        ) & set(templates.keys()):
            template = templates[prog]
            src = d / (prog + '-versionlock.list')
            with create_ro(src, 'w') as wf:
                for e, n, v, r, a in envras:
                    # `a` still carries the trailing newline from the input
                    # line, so each formatted entry is newline-terminated.
                    wf.write(template.format(e=e, n=n, v=v, r=r, a=a))
            set_new_key(
                dest_to_src_and_size,
                # This path convention must match how `write_yum_dnf_conf.py`
                # and `rpm_repo_snapshot.bzl` set up their output.
                snapshot_dir / f'{prog}/etc/{prog}/plugins/versionlock.list',
                (src, len(envras))
            )
        yield dest_to_src_and_size
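Every snippet on this page funnels its results through `set_new_key`, which is not shown here. A minimal sketch of its likely behavior, assuming it is an insert-only dict update that refuses to silently overwrite -- exactly the property these downloaders rely on to catch colliding keys and storage IDs:

def set_new_key(d, k, v) -> None:
    # Assumed semantics: `d[k] = v`, but loudly reject overwriting an
    # existing key instead of silently clobbering it.
    if k in d:
        raise KeyError(f'{k} was already set to {d[k]}, new value: {v}')
    d[k] = v

Under that assumption, every `set_new_key(...)` call in the examples below doubles as an assertion that two different objects never land on the same key.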
Example 2
def _normalize_snapshot_to_versionlock(
    snapshot_to_vl: Iterable[Tuple[Path, Path]],
    snapshots: Iterable[Path],
) -> Mapping[Path, Path]:
    snapshots = frozenset(snapshots)
    s_to_vl = {}
    for s, vl in snapshot_to_vl:
        assert s in snapshots, (s, snapshots)
        set_new_key(s_to_vl, s, vl)
    return MappingProxyType(s_to_vl)
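A hypothetical call, purely to illustrate the contract (the paths are made up): every pair must reference a known snapshot, a repeated snapshot raises via `set_new_key`, and the result is a read-only view.

snap_a, snap_b = Path('/snapshot/a'), Path('/snapshot/b')  # hypothetical paths
s_to_vl = _normalize_snapshot_to_versionlock(
    [(snap_a, Path('/locks/a.list'))],
    snapshots=[snap_a, snap_b],
)
assert s_to_vl[snap_a] == Path('/locks/a.list')
# s_to_vl[snap_b] = Path('/locks/b.list')  # would raise: mappingproxy is read-only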
Example 3
    def _memoize_error(self, obj, error: ReportableError):
        '''
        Any size or checksum errors we see are likely to be permanent, so we
        MUTATE `obj` with the error, hiding the old `storage_id` inside.
        '''
        error_dict = {
            **error.to_dict(),
            # Since `storage_id` is hidden, `send_head` will show the error.
            'storage_id': obj.pop('storage_id'),
        }
        set_new_key(obj, 'error', error_dict)
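For illustration only (the field names are assumptions, not the real `ReportableError` schema), the mutation turns a servable object into one that reports an error while remembering the old storage ID:

obj = {'checksum': 'sha384:...', 'size': 1234, 'storage_id': 'abc'}  # made-up blob entry
# After `self._memoize_error(obj, err)`, where `err.to_dict()` returns
# {'error': 'size_mismatch'} (a made-up payload), `obj` becomes:
#     {'checksum': 'sha384:...', 'size': 1234,
#      'error': {'error': 'size_mismatch', 'storage_id': 'abc'}}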
Example 4
    def _download_rpms(self, rpms: Iterable[Rpm], shard: RpmShard):
        log.info(f'''`{self._repo_name}` has {len(rpms)} RPMs weighing {
            sum(r.size for r in rpms)
        :,} bytes''')
        storage_id_to_rpm = {}
        # Download in random order to reduce collisions from racing writers.
        for rpm in shuffled(rpms):
            if not shard.in_shard(rpm):
                continue
            with self._repo_db_ctx as db:
                # If we get no `storage_id` back, there are 3 possibilities:
                #  - `rpm.filename` was never seen before.
                #  - `rpm.filename` was seen before, but it was hashed with
                #     different algorithm(s), so we MUST download and
                #     compute the canonical checksum to know if its contents
                #     are the same.
                #  - `rpm.filename` was seen before, **AND** one of the
                #    prior checksums used `rpm.checksum.algorithms`, but
                #    produced a different hash value.  In other words, this
                #    is a `MutableRpmError`, because the same filename must
                #    have had two different contents.  We COULD explicitly
                #    detect this error here, and avoid the download.
                #    However, this severe error should be infrequent, and we
                #    actually get valuable information from the download --
                #    this lets us know whether the file is wrong or the
                #    repodata is wrong.
                storage_id, canonical_checksum = \
                    db.get_rpm_storage_id_and_checksum(self._rpm_table, rpm)
            # If the RPM is already stored with a matching checksum, just
            # update its `.canonical_checksum`.
            if storage_id:
                rpm = rpm._replace(canonical_checksum=canonical_checksum)
            else:  # We have to download the RPM.
                try:
                    with _reportable_http_errors(rpm.location):
                        storage_id, rpm = self._download_rpm(rpm)
                # IMPORTANT: All the classes of errors that we handle below
                # have the property that we would not have stored anything
                # new in the DB, meaning that such failed RPMs will be
                # retried on the next snapshot attempt.
                except ReportableError as ex:
                    # RPM checksum validation errors, scenarios where the
                    # same RPM name occurs with different checksums, etc.
                    storage_id = ex

            # Detect if this RPM filename occurs with different contents.
            if not isinstance(storage_id, ReportableError):
                storage_id = self._detect_mutable_rpms(rpm, storage_id)

            set_new_key(storage_id_to_rpm, storage_id, rpm)

        assert len(storage_id_to_rpm) == sum(shard.in_shard(r) for r in rpms)
        return storage_id_to_rpm
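The shard bookkeeping above (`shard.in_shard(rpm)` plus the closing assert) only works if shard membership is deterministic. A plausible sketch of `RpmShard` under that assumption; the real class layout and hashing key may well differ:

import hashlib
from typing import NamedTuple

class RpmShard(NamedTuple):  # hypothetical reconstruction
    shard: int
    modulo: int

    def in_shard(self, rpm) -> bool:
        # Deterministically assign every RPM to exactly one of `modulo` shards.
        digest = hashlib.sha256(str(rpm).encode()).hexdigest()
        return int(digest, 16) % self.modulo == self.shard

Determinism is what makes the final assert valid: the number of stored RPMs must equal the number of RPMs this shard claims.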
Example 5
def _download_repodatas(
        repo: YumDnfConfRepo, repomd: RepoMetadata,
        cfg: DownloadConfig) -> Tuple[Set[Rpm], Mapping[str, Repodata]]:
    rpms = None  # We'll extract these from the primary repodata
    storage_id_to_repodata = {}  # Newly stored **and** pre-existing
    repodata_table = RepodataTable()
    primary_repodata = pick_primary_repodata(repomd.repodatas)
    log_size(f"`{repo.name}` repodata weighs",
             sum(rd.size for rd in repomd.repodatas))
    rw_db_conn = cfg.new_db_conn(readonly=False)
    with ThreadPoolExecutor(max_workers=cfg.threads) as executor:
        futures = [
            executor.submit(
                _download_repodata,
                repodata,
                repo_url=repo.base_url,
                repodata_table=repodata_table,
                cfg=cfg,
                is_primary=repodata is primary_repodata,
            ) for repodata in shuffled(repomd.repodatas)
        ]

        for future in as_completed(futures):
            res = future.result()
            if res.newly_stored:
                # We don't want to store errors in the repo DB -- and we
                # should never see one here, since `newly_stored` is only
                # True when we successfully commit a new repodata to storage.
                assert not isinstance(res.storage_id, ReportableError)
                # This repodata was newly downloaded and stored in storage, so
                # we store its storage_id to repo_db regardless of whether we
                # encounter fatal errors later on that fail the snapshot; see
                # docblock in `repo_downloader.py` for reasoning
                storage_id = maybe_write_id(res.repodata, res.storage_id,
                                            repodata_table, rw_db_conn)
            else:
                storage_id = res.storage_id
            if res.maybe_rpms is not None:
                # RPMs will only have been returned by the primary, thus we
                # should only enter this block once
                assert rpms is None
                # Convert to a set to work around buggy repodatas, which
                # list the same RPM object twice.
                rpms = frozenset(res.maybe_rpms)
            set_new_key(storage_id_to_repodata, storage_id, res.repodata)
    # It's possible that we received errors when downloading non-primary
    # repodatas -- in that case we store the error in the SQLite DB, so the
    # dict should contain an entry for every single repodata.
    assert len(storage_id_to_repodata) == len(repomd.repodatas)
    if not rpms:
        log.warning(f"Repo {repo} has no RPMs")
    return rpms, storage_id_to_repodata
Example 6
    def _download_repodatas(
        self,
        repomd: RepoMetadata,
        # We mutate this dictionary on-commit to allow the caller to clean
        # up any stored repodata blobs if the download fails part-way.
        persist_storage_id_to_repodata: Mapping[str, Repodata],
        visitors: Iterable['RepoObjectVisitor'],
    ) -> Tuple[Set[Rpm], Mapping[str, Repodata]]:
        rpms = None  # We'll extract these from the primary repodata
        storage_id_to_repodata = {}  # Newly stored **and** pre-existing
        primary_repodata = pick_primary_repodata(repomd.repodatas)
        log.info(f'''`{self._repo_name}` repodata weighs {
            sum(rd.size for rd in repomd.repodatas)
        :,} bytes''')
        # Visitors see all declared repodata, even if some downloads fail.
        for visitor in visitors:
            for repodata in repomd.repodatas:
                visitor.visit_repodata(repodata)
        # Download in random order to reduce collisions from racing writers.
        for repodata in shuffled(repomd.repodatas):
            try:
                with _reportable_http_errors(repodata.location):
                    newly_stored, storage_id, maybe_rpms = \
                        self._download_repodata(
                            repodata, is_primary=repodata is primary_repodata,
                        )
                if newly_stored:
                    set_new_key(
                        persist_storage_id_to_repodata,
                        storage_id,
                        repodata,
                    )
                if maybe_rpms is not None:
                    # Convert to a set to work around buggy repodatas, which
                    # list the same RPM object twice.
                    rpms = set(maybe_rpms)
            except ReportableError as ex:
                # We cannot proceed without the primary file -- raise here
                # to trigger the "top-level retry" in the snapshot driver.
                if repodata is primary_repodata:
                    raise
                # This fake "storage ID" is not written to
                # `persist_storage_id_to_repodata`, so we will never attempt
                # to write it to the DB.  However, it does end up in
                # `repodata.json`, so the error is visible.
                storage_id = ex
            set_new_key(storage_id_to_repodata, storage_id, repodata)

        assert len(storage_id_to_repodata) == len(repomd.repodatas)
        assert rpms, 'Is the repo empty?'
        return rpms, storage_id_to_repodata
Example 7
    def _join_snapshots(self, snapshots: List[RepoSnapshot]) -> RepoSnapshot:
        # repomd & repodata should be the same across all shards
        repomd = snapshots[0].repomd
        storage_id_to_repodata = snapshots[0].storage_id_to_repodata
        storage_id_to_rpm = {}

        for snapshot in snapshots:
            self._check_repomd_equal(repomd, snapshot.repomd)
            self.assertEqual(
                storage_id_to_repodata, snapshot.storage_id_to_repodata,
            )
            for sid, rpm in snapshot.storage_id_to_rpm.items():
                set_new_key(storage_id_to_rpm, sid, rpm)

        return RepoSnapshot(
            repomd=repomd,
            storage_id_to_repodata=storage_id_to_repodata,
            storage_id_to_rpm=storage_id_to_rpm,
        )
Example 8
def add_snapshot_db_objs(db):
    location_to_obj = {}
    for repo, build_timestamp, metadata_xml in db.execute('''
    SELECT "repo", "build_timestamp", "metadata_xml" FROM "repomd"
    ''').fetchall():
        set_new_key(
            location_to_obj, os.path.join(repo, 'repodata/repomd.xml'), {
                'size': len(metadata_xml),
                'build_timestamp': build_timestamp,
                'content_bytes': metadata_xml.encode(),
            })
    for table in ['repodata', 'rpm']:
        for (
                repo,
                path,
                build_timestamp,
                checksum,
                error,
                error_json,
                size,
                storage_id,
        ) in db.execute(f'''
        SELECT
            "repo", "path", "build_timestamp", "checksum", "error",
            "error_json", "size", "storage_id"
        FROM "{table}"
        ''').fetchall():
            obj = {
                'checksum': checksum,
                'size': size,
                'build_timestamp': build_timestamp,
            }
            # `storage_id` is populated in the DB table for `mutable_rpm`
            # errors, but we don't want to serve up those files.
            if storage_id and not error and not error_json:
                obj['storage_id'] = storage_id
            elif error and error_json:
                obj['error'] = {'error': error, **json.loads(error_json)}
            else:  # pragma: no cover
                raise AssertionError(f'{storage_id} {error} {error_json}')
            set_new_key(location_to_obj, os.path.join(repo, path), obj)
    return location_to_obj
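The returned `location_to_obj` is keyed by repo-relative path; an illustrative shape, with fields taken from the code above (the concrete values are made up):

# {
#     'some-repo/repodata/repomd.xml': {
#         'size': 2912, 'build_timestamp': 1600000000,
#         'content_bytes': b'<?xml ...?>',
#     },
#     'some-repo/Packages/foo-1.0-1.x86_64.rpm': {
#         'checksum': 'sha384:...', 'size': 56789, 'build_timestamp': 1600000000,
#         'storage_id': 'storage:abc',  # or 'error': {...} for broken objects
#     },
# }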
Example 9
def read_snapshot_dir(path: str):
    location_to_obj = {}
    for repo in os.listdir(path):
        if repo == 'yum.conf':
            continue
        repo_path = Path(path) / repo

        for filename in ['rpm.json', 'repodata.json']:
            with open(repo_path / filename) as infile:
                for location, obj in json.load(infile).items():
                    set_new_key(
                        location_to_obj, os.path.join(repo, location), obj
                    )

        # Re-parse and serialize the metadata to a format that ALMOST
        # matches the other blobs (imitating `RepoSnapshot.to_directory()`).
        # If useful, it would not be offensive to make such a `repomd.json`
        # be emitted by RepoSnapshot, instead of `repomd.xml`.  Caveat: JSON
        # isn't suitable for bytes, and the XML is currently bytes.
        with open(repo_path / 'repomd.xml', 'rb') as infile:
            repomd = RepoMetadata.new(xml=infile.read())
        location_to_obj[os.path.join(repo, 'repodata/repomd.xml')] = {
            'size': repomd.size,
            'build_timestamp': repomd.build_timestamp,
            'content_bytes': repomd.xml,  # Instead of `storage_id`
        }

        # Similarly, make JSON metadata for the repo's GPG keys.
        key_dir = repo_path / 'gpg_keys'
        for key_filename in os.listdir(key_dir.decode()):
            with open(key_dir / key_filename, 'rb') as infile:
                key_content = infile.read()
            location_to_obj[os.path.join(repo, key_filename)] = {
                'size': len(key_content),
                # We don't have a good timestamp for these, so set it to
                # "now".  Caching efficiency losses should be negligible :)
                'build_timestamp': int(time.time()),
                'content_bytes': key_content,  # Instead of `storage_id`
            }

    return location_to_obj
Example 10
    @contextmanager
    def wrapped_popen(opts: _NspawnOpts, popen_args: PopenArgs):
        with ExitStack() as stack:
            dest_to_src = {}
            for snapshot, versionlock in snapshot_to_versionlock.items():
                for dest, (src, vl_size) in stack.enter_context(
                    _prepare_versionlock_lists(
                        # Same note as in `inject_repo_servers.py` regarding
                        # the usage of the pre-snapshot subvolume.
                        opts.layer, snapshot, versionlock,
                    )
                ).items():
                    log.info(f'Locking {vl_size} RPM versions via {dest}')
                    set_new_key(dest_to_src, dest, src)
            yield stack.enter_context(popen(
                opts._replace(
                    bindmount_ro=(*opts.bindmount_ro, *(
                        (s, d) for d, s in dest_to_src.items()
                    )),
                ),
                popen_args,
            ))
Example 11
    def _commit_repodata_and_cancel_cleanup(
        self,
        repomd: RepoMetadata,
        # We'll replace our IDs by those that actually ended up in the DB
        storage_id_to_repodata: Mapping[str, Repodata],
        # Will retain only those IDs that are unused by the DB and need cleanup
        persist_storage_id_to_repodata: Mapping[str, Repodata],
    ):
        with self._repo_db_ctx as repo_db:
            # We cannot touch `persist_storage_id_to_repodata` in the loop
            # because until the transaction commits, we must be ready to
            # delete all new storage IDs.  So instead, we will construct the
            # post-commit version of that dictionary (i.e. blobs we need to
            # delete even if the transaction lands), in this variable:
            unneeded_storage_id_to_repodata = {}
            for storage_id, repodata in persist_storage_id_to_repodata.items():
                assert not isinstance(storage_id, ReportableError), repodata
                db_storage_id = repo_db.maybe_store(
                    self._repodata_table, repodata, storage_id
                )
                _log_if_storage_ids_differ(repodata, storage_id, db_storage_id)
                if db_storage_id != storage_id:
                    set_new_key(
                        storage_id_to_repodata,
                        db_storage_id,
                        storage_id_to_repodata.pop(storage_id),
                    )
                    set_new_key(
                        unneeded_storage_id_to_repodata,
                        storage_id,
                        repodata,
                    )
            repo_db.store_repomd(self._repo_name, repomd)
            repo_db.commit()
            # The DB commit was successful, and we're about to exit the
            # repo_db context, which might, at worst, raise its own error.
            # Therefore, let's prevent the `finally` cleanup from deleting
            # the blobs whose IDs we just committed to the DB.
            persist_storage_id_to_repodata.clear()
            persist_storage_id_to_repodata.update(
                unneeded_storage_id_to_repodata)
Example 12
    def _reduce_equal_snapshots(
        self, repo_snapshots: List[Tuple[YumDnfConfRepo,
                                         RepoSnapshot]]) -> RepoSnapshot:
        self.assertGreater(len(repo_snapshots), 0)
        # repo, repomd & repodata should be the same across all shards
        head_repo, head_snapshot = repo_snapshots[0]
        head_repomd = head_snapshot.repomd
        head_storage_id_to_repodata = head_snapshot.storage_id_to_repodata
        storage_id_to_rpm = {}

        for repo, snapshot in repo_snapshots[1:]:
            self.assertEqual(head_repo, repo)
            self._check_repomd_equal(head_repomd, snapshot.repomd)
            self.assertEqual(head_storage_id_to_repodata,
                             snapshot.storage_id_to_repodata)
            for sid, rpm in snapshot.storage_id_to_rpm.items():
                set_new_key(storage_id_to_rpm, sid, rpm)

        return RepoSnapshot(
            repomd=head_repomd,
            storage_id_to_repodata=head_storage_id_to_repodata,
            storage_id_to_rpm=storage_id_to_rpm,
        )
Example 13
def _download_rpms(
    repo: YumDnfConfRepo,
    rpm_table: RpmTable,
    rpms: Iterable[Rpm],
    all_snapshot_universes: Set[str],
    cfg: DownloadConfig,
):
    log_size(f"`{repo.name}` has {len(rpms)} RPMs weighing",
             sum(r.size for r in rpms))
    storage_id_to_rpm = {}
    rw_db_conn = cfg.new_db_conn(readonly=False)
    ro_db_conn = cfg.new_db_conn(readonly=True)
    with ThreadPoolExecutor(max_workers=cfg.threads) as executor:
        futures = [
            executor.submit(_handle_rpm, rpm, repo.base_url, rpm_table,
                            all_snapshot_universes, cfg)
            # Download in random order to reduce collisions from racing writers.
            for rpm in shuffled(rpms) if cfg.rpm_shard.in_shard(rpm)
        ]
        for future in as_completed(futures):
            rpm, res_storage_id = future.result()
            if not isinstance(res_storage_id, ReportableError):
                # If it's valid, we store this storage_id to repo_db
                # regardless of whether we later hit fatal errors that keep
                # the snapshot from finishing -- see the top-level docblock
                # for the reasoning.
                res_storage_id = maybe_write_id(rpm, res_storage_id, rpm_table,
                                                rw_db_conn)
                # Detect if this RPM NEVRA occurs with different contents.
                res_storage_id = _detect_mutable_rpms(rpm, rpm_table,
                                                      res_storage_id,
                                                      all_snapshot_universes,
                                                      ro_db_conn)
            set_new_key(storage_id_to_rpm, res_storage_id, rpm)

    assert len(storage_id_to_rpm) == sum(
        cfg.rpm_shard.in_shard(r) for r in rpms)
    return storage_id_to_rpm