Example #1
def get_jobs(pypi_fetcher_dir, bucket, processed, amount=1000):
    """Collect up to `amount` sdist crawl jobs for one bucket, skipping (name, version) pairs already processed."""
    pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi",
                               restrict_to_bucket=bucket)
    jobs = []
    names = list(pypi_dict.by_bucket(bucket).keys())
    total_nr = 0
    for pkg_name in names:
        for ver, release_types in pypi_dict[pkg_name].items():
            if 'sdist' not in release_types:
                continue
            if (pkg_name, ver) in processed:
                continue
            total_nr += 1
            release = release_types['sdist']
            if len(jobs) < amount:  # cap the number of jobs scheduled per run
                jobs.append(
                    PackageJob(
                        bucket,
                        pkg_name,
                        ver,
                        f"https://files.pythonhosted.org/packages/source/{pkg_name[0]}/{pkg_name}/{release[1]}",
                        release[0],
                        0,
                    ))
    shuffle(jobs)
    for i, job in enumerate(jobs):
        job.idx = i
    print(
        f"Bucket {bucket}: Planning execution of {len(jobs)} jobs out of {total_nr} total jobs for this bucket"
    )
    return jobs
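A minimal usage sketch for this sdist-oriented get_jobs; the fetcher directory, the bucket key, and the processed set are placeholder values, and LazyBucketDict, PackageJob, and shuffle are assumed to be available in the surrounding module:

# Hypothetical invocation; "0a" is only an example bucket key.
processed = {("requests", "2.28.1")}  # (name, version) pairs that were already crawled
jobs = get_jobs("/tmp/pypi_fetcher", "0a", processed, amount=500)
print(f"{len(jobs)} sdist jobs queued, first job index: {jobs[0].idx if jobs else None}")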
Example #2
def purge(pypi_index, pkgs_dict: LazyBucketDict, bucket, py_vers):
    # purge all versions which are not on pypi anymore
    for name, vers in pkgs_dict.by_bucket(bucket).copy().items():
        if name not in pypi_index:
            print(
                f"deleting package {name} from DB because it has been removed from pypi"
            )
            del pkgs_dict[name]
            continue
        for ver in tuple(vers.keys()):
            if ver not in pypi_index[name]:
                print(
                    f"deleting package {name} version {ver} from DB because it has been removed from pypi"
                )
                del pkgs_dict[name][ver]
    # purge old python versions
    for name, vers in pkgs_dict.by_bucket(bucket).copy().items():
        for ver, pyvers in vers.copy().items():
            for pyver in tuple(pyvers.keys()):
                if pyver not in py_vers:
                    print(
                        f"deleting package {name} version {ver} for python {pyver}"
                        f" from DB because we dropped support for this python version"
                    )
                    del pkgs_dict[name][ver][pyver]
            if len(pkgs_dict[name][ver]) == 0:
                print(
                    f"deleting package {name} version {ver} from DB"
                    f" because it is not compatible with any of our supported python versions"
                )
                del pkgs_dict[name][ver]
        if len(pkgs_dict[name]) == 0:
            print(
                f"deleting package {name} from DB"
                f" because it has no releases left which are compatible with any of our supported python versions"
            )
            del pkgs_dict[name]
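A minimal sketch of how purge is typically driven per bucket, mirroring the call in Example #4; the directory paths and the python version list are placeholders:

for bucket in LazyBucketDict.bucket_keys():
    pkgs_dict = LazyBucketDict("./sdist", restrict_to_bucket=bucket)                  # local dump DB
    pypi_index = LazyBucketDict("/tmp/pypi_fetcher/pypi", restrict_to_bucket=bucket)  # current pypi index
    decompress(pkgs_dict.by_bucket(bucket))  # stored data is compressed on disk (see Example #4)
    purge(pypi_index, pkgs_dict, bucket, ["37", "38", "39", "310"])
    compress(pkgs_dict.by_bucket(bucket))
    pkgs_dict.save()                         # persist the pruned bucket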
Example #3
def get_jobs(bucket, pypi_dict: LazyBucketDict, dump_dict: LazyBucketDict):
    """Build one wheel crawl job for every wheel file in the bucket that is not yet present in dump_dict."""
    names = list(pypi_dict.by_bucket(bucket).keys())
    jobs = []
    for pkg_name in names:
        for ver, release_types in pypi_dict[pkg_name].items():
            if 'wheels' not in release_types:
                continue
            for filename, data in release_types['wheels'].items():
                pyver = data[1]
                if is_done(dump_dict, pkg_name, ver, pyver, filename):
                    continue
                url = construct_url(pkg_name, pyver, filename)
                jobs.append(
                    dict(name=pkg_name,
                         ver=ver,
                         filename=filename,
                         pyver=pyver,
                         url=url,
                         bucket=bucket))
    shuffle(jobs)
    return [Job(**j, nr=idx) for idx, j in enumerate(jobs)]
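A minimal usage sketch for this wheel-oriented get_jobs; is_done, construct_url, and Job are assumed to live in the same module, and the dump directory path is a placeholder:

for bucket in LazyBucketDict.bucket_keys():
    pypi_dict = LazyBucketDict("/tmp/pypi_fetcher/pypi", restrict_to_bucket=bucket)
    dump_dict = LazyBucketDict("./wheels", restrict_to_bucket=bucket)  # placeholder dump DB
    jobs = get_jobs(bucket, pypi_dict, dump_dict)
    print(f"bucket {bucket}: {len(jobs)} wheel jobs")
    break  # inspect only the first bucket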
Example #4
def main():
    # settings related to performance/parallelization
    amount_buckets = int(os.environ.get('AMOUNT_BUCKETS', "256"))
    limit_names = set(
        filter(lambda n: bool(n),
               os.environ.get('LIMIT_NAMES', "").split(',')))
    max_minutes = int(os.environ.get('MAX_MINUTES', "0"))
    bucket_jobs = int(os.environ.get('BUCKET_JOBS', "0"))
    start_bucket = int(os.environ.get('BUCKET_START', "0"))
    workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count() * 2))

    # general settings
    dump_dir = os.environ.get('DUMP_DIR', "./sdist")
    extractor_src = os.environ.get("EXTRACTOR_SRC")
    if not extractor_src:
        raise Exception(
            "Set env variable 'EXTRACTOR_SRC to {mach-nix}/lib/extractor'")
    min_free_gb = int(os.environ.get('MIN_FREE_GB', "0"))
    py_vers_short = os.environ.get('PYTHON_VERSIONS',
                                   "27,36,37,38,39,310").strip().split(',')
    pypi_fetcher_dir = os.environ.get('PYPI_FETCHER', '/tmp/pypi_fetcher')
    store = os.environ.get('STORE', None)

    deadline_total = time() + max_minutes * 60 if max_minutes else None

    # cache build time deps, otherwise first job will be slow
    with Measure("ensure build time deps"):
        build_base(extractor_src, py_vers_short, store=store)

    garbage_collected = False

    for idx, bucket in enumerate(LazyBucketDict.bucket_keys()):
        # calculate per bucket deadline if MAX_MINUTES is used
        if deadline_total:
            amount = min(amount_buckets, 256 - start_bucket)
            deadline = time() + (deadline_total - time()) / amount
        else:
            deadline = None
        if idx < start_bucket or idx >= start_bucket + amount_buckets:
            continue
        pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        pypi_index = LazyBucketDict(f"{pypi_fetcher_dir}/pypi",
                                    restrict_to_bucket=bucket)
        # load error data
        error_dict = LazyBucketDict(dump_dir + "-errors",
                                    restrict_to_bucket=bucket)
        decompress(error_dict.by_bucket(bucket))
        with Measure('Get processed pkgs'):
            print(
                f"DB contains {len(list(pkgs_dict.keys()))} pkgs at this time for bucket {bucket}"
            )
        with Measure("decompressing data"):
            decompress(pkgs_dict.by_bucket(bucket))
        # purge data for old python versions and packages which got deleted from pypi
        with Measure("purging packages"):
            purge(pypi_index, pkgs_dict, bucket, py_vers_short)
        with Measure("getting jobs"):
            jobs = get_jobs(pypi_index,
                            error_dict,
                            pkgs_dict,
                            bucket,
                            py_vers_short,
                            limit_num=bucket_jobs,
                            limit_names=limit_names)
            if not jobs:
                continue
            compute_drvs(jobs, extractor_src, store=store)

        # ensure that all the build time dependencies are cached before starting,
        # otherwise jobs might time out
        if garbage_collected:
            with Measure("ensure build time deps"):
                build_base(extractor_src, py_vers_short, store=store)
        with Measure('executing jobs'):
            if workers > 1:
                pool_results = utils.parallel(extract_requirements,
                                              (jobs, (deadline, ) * len(jobs),
                                               (len(jobs), ) * len(jobs),
                                               (store, ) * len(jobs)),
                                              workers=workers,
                                              use_processes=False)
            else:
                pool_results = [
                    extract_requirements(args, deadline, store)
                    for args in jobs
                ]

        # filter out exceptions
        results = []
        for res in pool_results:
            if not isinstance(res, Exception):
                results.extend(res)

        # insert new data
        for pkg in sorted(results,
                          key=lambda pkg:
                          (pkg.name, pkg.version, sort_key_pyver(pkg.py_ver))):
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            if pkg.error:
                target = error_dict
            else:
                target = pkgs_dict
            insert(py_ver,
                   pkg.name,
                   pkg.version,
                   pkg_to_dict(pkg),
                   target,
                   error=pkg.error)

        # compress and save
        with Measure("compressing data"):
            compress(pkgs_dict.by_bucket(bucket))
            compress(error_dict.by_bucket(bucket))
        print("finished compressing data")
        with Measure("saving data"):
            pkgs_dict.save()
            error_dict.save()

        # collect garbage if free space < MIN_FREE_GB
        free_gb = shutil.disk_usage(store or "/nix/store").free / (1000 ** 3)
        if free_gb < min_free_gb:
            with Measure("collecting nix store garbage"):
                sp.run(
                    f"nix-collect-garbage {f'--store {store}' if store else ''}",
                    capture_output=True,
                    shell=True)
                garbage_collected = True

        # stop execution if the total deadline has passed
        if deadline_total and time() > deadline_total:
            print(
                f"Deadline occurred. Stopping execution. Last Bucket was {bucket}"
            )
            break
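A hedged sketch of how main might be launched as a script; the entry-point guard is an assumption about the surrounding module, the extractor path is a placeholder, and the environment variable names match the ones read at the top of main:

if __name__ == "__main__":
    # Illustrative defaults only; main() itself reads the same variables.
    os.environ.setdefault("EXTRACTOR_SRC", "/path/to/mach-nix/lib/extractor")
    os.environ.setdefault("AMOUNT_BUCKETS", "4")  # crawl only 4 buckets in this run
    os.environ.setdefault("WORKERS", "8")         # thread workers per bucket
    main()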