Esempio n. 1
0
def _download_repodatas(
        repo: YumDnfConfRepo, repomd: RepoMetadata,
        cfg: DownloadConfig) -> Tuple[Set[Rpm], Mapping[str, Repodata]]:
    """Download all repodatas of `repomd` on a thread pool and index them.

    Every repodata in `repomd.repodatas` is submitted to
    `_download_repodata` (in the order produced by `shuffled`) using
    `cfg.threads` workers.  Results are consumed as they complete:

      - a result flagged `newly_stored` has its storage ID passed through
        `maybe_write_id` together with a read-write DB connection;
      - any other result contributes its `storage_id` unchanged.

    Returns a pair `(rpms, storage_id_to_repodata)`.  `rpms` is a frozenset
    built from the single result whose `maybe_rpms` was not None (it stays
    None if no result carried RPMs); the mapping has one entry per repodata.
    """
    found_rpms = None  # Filled in from the primary repodata's result
    repodata_by_storage_id = {}  # Newly stored **and** pre-existing
    table = RepodataTable()
    primary = pick_primary_repodata(repomd.repodatas)
    log_size(f"`{repo.name}` repodata weighs",
             sum(rd.size for rd in repomd.repodatas))
    db_conn = cfg.new_db_conn(readonly=False)
    with ThreadPoolExecutor(max_workers=cfg.threads) as pool:
        pending = []
        for repodata in shuffled(repomd.repodatas):
            pending.append(pool.submit(
                _download_repodata,
                repodata,
                repo_url=repo.base_url,
                repodata_table=table,
                cfg=cfg,
                is_primary=repodata is primary,
            ))

        for done in as_completed(pending):
            outcome = done.result()
            if not outcome.newly_stored:
                storage_id = outcome.storage_id
            else:
                # `newly_stored` is only expected to be True after a repodata
                # was successfully committed to storage, so an error object
                # must never show up here -- we don't want errors written
                # into the repo db.
                assert not isinstance(outcome.storage_id, ReportableError)
                # Record the freshly stored ID in repo_db right away, even if
                # a later fatal error ends up failing the snapshot; see
                # docblock in `repo_downloader.py` for reasoning
                storage_id = maybe_write_id(outcome.repodata,
                                            outcome.storage_id,
                                            table, db_conn)
            if outcome.maybe_rpms is not None:
                # Only the primary repodata is expected to return RPMs, so
                # this branch must run at most once.
                assert found_rpms is None
                # Buggy repodatas may list the same RPM object twice; a set
                # collapses such repeats.
                found_rpms = frozenset(outcome.maybe_rpms)
            set_new_key(repodata_by_storage_id, storage_id, outcome.repodata)
    # Non-primary repodatas may have failed to download; their errors are
    # kept as keys too, so every single repodata must be represented here.
    assert len(repodata_by_storage_id) == len(repomd.repodatas)
    if not found_rpms:
        log.warning(f"Repo {repo} has no RPMs")
    return found_rpms, repodata_by_storage_id
Esempio n. 2
0
def gen_rpms_from_repodatas(
    repodata_results: Iterable[DownloadResult],
    cfg: DownloadConfig,
    all_snapshot_universes: FrozenSet[str],
) -> Iterator[DownloadResult]:
    """Lazily download the RPMs for each repo in `repodata_results`.

    For every incoming result, `_download_rpms` is invoked (timed via
    `timeit`) with a fresh `RpmTable` for the result's `repo_universe`, and
    the downloaded size is reported through `log_size`.

    Yields a copy of each input result whose `storage_id_to_rpm` field is a
    read-only `MappingProxyType` view of the mapping that was just built.
    """
    for repo_result in repodata_results:
        repo = repo_result.repo
        with timeit(f"Downloading RPMs for repo {repo}"):
            rpm_table = RpmTable(repo_result.repo_universe)
            rpm_by_storage_id, downloaded = _download_rpms(
                repo,
                rpm_table,
                repo_result.rpms,
                all_snapshot_universes,
                cfg,
            )
            log_size(f"Repo {repo} downloaded ", downloaded)
        read_only_view = MappingProxyType(rpm_by_storage_id)
        yield repo_result._replace(storage_id_to_rpm=read_only_view)
Esempio n. 3
0
def _download_rpms(
    repo: YumDnfConfRepo,
    rpm_table: RpmTable,
    rpms: Iterable[Rpm],
    all_snapshot_universes: Set[str],
    cfg: DownloadConfig,
):
    """Process every in-shard RPM of `repo` on a thread pool.

    Each RPM accepted by `cfg.rpm_shard.in_shard` is handed to `_handle_rpm`
    using `cfg.threads` workers.  A non-error storage ID is passed through
    `maybe_write_id` and then `_detect_mutable_rpms`; a `ReportableError`
    is used as the key unchanged.

    NB: although annotated as `Iterable`, `rpms` has `len()` applied to it
    and is iterated several times, so it must be a sized, re-iterable
    collection.

    Returns a dict mapping each resulting storage ID (or error) to its RPM.
    """
    log_size(f"`{repo.name}` has {len(rpms)} RPMs weighing",
             sum(r.size for r in rpms))
    rpm_by_storage_id = {}
    write_conn = cfg.new_db_conn(readonly=False)
    read_conn = cfg.new_db_conn(readonly=True)
    with ThreadPoolExecutor(max_workers=cfg.threads) as pool:
        pending = []
        # Download in random order to reduce collisions from racing writers.
        for candidate in shuffled(rpms):
            if not cfg.rpm_shard.in_shard(candidate):
                continue
            pending.append(pool.submit(
                _handle_rpm, candidate, repo.base_url, rpm_table,
                all_snapshot_universes, cfg))
        for done in as_completed(pending):
            rpm, storage_id = done.result()
            is_error = isinstance(storage_id, ReportableError)
            if not is_error:
                # A valid storage ID is written to repo_db immediately, even
                # if a later fatal error prevents the snapshot from being
                # finished - see top-level docblock for reasoning
                storage_id = maybe_write_id(rpm, storage_id, rpm_table,
                                            write_conn)
                # Detect if this RPM NEVRA occurs with different contents.
                storage_id = _detect_mutable_rpms(rpm, rpm_table, storage_id,
                                                  all_snapshot_universes,
                                                  read_conn)
            set_new_key(rpm_by_storage_id, storage_id, rpm)

    # Every in-shard RPM must have produced exactly one entry.
    assert len(rpm_by_storage_id) == sum(
        cfg.rpm_shard.in_shard(r) for r in rpms)
    return rpm_by_storage_id
Esempio n. 4
0
def _download_rpms(
    repo: YumDnfConfRepo,
    rpm_table: RpmTable,
    rpms: Iterable[Rpm],
    all_snapshot_universes: Set[str],
    cfg: DownloadConfig,
) -> Tuple[Dict[MaybeStorageID, Rpm], float]:
    """Process every in-shard RPM of `repo` on a thread pool.

    Each RPM accepted by `cfg.rpm_shard.in_shard` is handed to `_handle_rpm`
    using `cfg.threads` workers, together with a per-thread bandwidth value
    of `MIN_TOTAL_BW / cfg.threads`.  A non-error storage ID is passed
    through `maybe_write_id` and then `_detect_mutable_rpms`; a
    `ReportableError` is used as the key unchanged.

    If two different RPMs end up with the same key, the later one replaces
    the earlier one: a warning is logged when they differ only in
    `location`, otherwise `RuntimeError` is raised.

    NB: although annotated as `Iterable`, `rpms` has `len()` applied to it
    and is iterated several times, so it must be a sized, re-iterable
    collection.

    Returns `(storage_id_to_rpm, total_bytes_downloaded)`, the latter being
    the sum of the byte counts reported by `_handle_rpm`.
    """
    log_size(f"`{repo.name}` has {len(rpms)} RPMs weighing",
             sum(r.size for r in rpms))
    rpm_by_storage_id = {}
    num_duplicates = 0
    write_conn = cfg.new_db_conn(readonly=False)
    read_conn = cfg.new_db_conn(readonly=True)
    per_thread_bw = MIN_TOTAL_BW / cfg.threads
    bytes_total = 0
    with ThreadPoolExecutor(max_workers=cfg.threads) as pool:
        pending = []
        # Download in random order to reduce collisions from racing writers.
        for candidate in shuffled(rpms):
            if not cfg.rpm_shard.in_shard(candidate):
                continue
            pending.append(pool.submit(
                _handle_rpm,
                candidate,
                repo.base_url,
                rpm_table,
                all_snapshot_universes,
                cfg,
                per_thread_bw,
            ))
        for done in as_completed(pending):
            rpm, storage_id, bytes_dl = done.result()
            bytes_total += bytes_dl
            is_error = isinstance(storage_id, ReportableError)
            if not is_error:
                # A valid storage ID is written to repo_db immediately, even
                # if a later fatal error ends up failing the snapshot; see
                # docblock in `repo_downloader.py` for reasoning
                storage_id = maybe_write_id(rpm, storage_id, rpm_table,
                                            write_conn)
                # Detect if this RPM NEVRA occurs with different contents.
                with timeit(f"Detecting mutable RPMs for {rpm}",
                            threshold_s=10):
                    storage_id = _detect_mutable_rpms(
                        rpm, rpm_table, storage_id, all_snapshot_universes,
                        read_conn)
            previous = rpm_by_storage_id.get(storage_id)
            if previous and previous != rpm:  # pragma: no cover
                num_duplicates += 1
                message = (f'Same ID {storage_id} with differing RPMs: '
                           f'{previous} != {rpm}')
                # Diverging locations are tolerated: a single location per
                # NEVRA is enough to be able to fetch the RPM.
                same_besides_location = (
                    previous._replace(location=None)
                    == rpm._replace(location=None))
                if not same_besides_location:
                    raise RuntimeError(message)
                log.warning(message)
            rpm_by_storage_id[storage_id] = rpm

    # One entry per in-shard RPM, minus the tolerated duplicates that were
    # overwritten above.
    assert (len(rpm_by_storage_id) == (
        sum(cfg.rpm_shard.in_shard(r) for r in rpms) - num_duplicates))
    return rpm_by_storage_id, bytes_total