Example #1
def crawl_pkgs_meta(packages, target_dir, workers):
    pkgs_dict = LazyBucketDict(target_dir)
    args_list = [(name, pkgs_dict) for name in packages]
    if workers > 1:
        # zip(*args_list) transposes per-job tuples into per-argument iterables
        utils.parallel(save_pkg_meta, zip(*args_list), workers=workers)
    else:
        for args in args_list:
            save_pkg_meta(*args)
    pkgs_dict.save()
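Example #1 relies on a utils.parallel helper that is not shown in the snippet. The sketch below is only an assumption about its shape: it takes the target function, a tuple of per-argument iterables, and a worker count, then star-maps the zipped arguments onto a pool. The project's real helper may behave differently.

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def parallel(func, args_iterables, workers=1, use_processes=True):
    # Hypothetical sketch: args_iterables holds one iterable per parameter of func,
    # so executor.map(func, *args_iterables) calls func(a, b, ...) once per job.
    executor_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor
    with executor_cls(max_workers=workers) as executor:
        return list(executor.map(func, *args_iterables))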
Example #2
def main():
    dump_dir = sys.argv[1]
    for bucket_key, key_set in get_names_per_bucket().items():
        pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket_key)
        pkgs = P.select(
            P.id,
            P.name,
            P.version,
            P.py_ver,
            P.install_requires,
            P.setup_requires,
            P.extras_require,
            P.tests_require,
            P.python_requires,
        ).where(P.error.is_null(), P.name.in_(key_set))
        print(f'dumping bucket {bucket_key}')
        for pkg in sorted(pkgs, key=lambda pkg: (pkg.name, pkg.version, pkg.py_ver)):
            # keep only the digits of the python version, e.g. "3.8" -> "38"
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            insert(py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), pkgs_dict)
        compress(pkgs_dict)
        pkgs_dict.save()
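Example #2 uses an insert helper that is defined elsewhere. The following is a hypothetical sketch of what it could look like, assuming (as the manual nesting in Example #3 suggests) that entries are keyed as name -> python version -> package version; how the real helper treats the error flag is unknown, so it is only accepted here.

def insert(py_ver, name, version, data, target_dict, error=None):
    # create missing nesting levels on demand: name -> python version -> version
    if name not in target_dict:
        target_dict[name] = {}
    if py_ver not in target_dict[name]:
        target_dict[name][py_ver] = {}
    # error is accepted for signature compatibility; its real handling is unknown
    target_dict[name][py_ver][version] = data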
Example #3
def main():
    dump_dir = sys.argv[1]
    workers = int(os.environ.get('WORKERS', "1"))
    pypi_fetcher_dir = os.environ.get('pypi_fetcher')
    print(f'Index directory: {pypi_fetcher_dir}')
    assert isdir(pypi_fetcher_dir)
    for bucket in LazyBucketDict.bucket_keys():
        pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi")
        dump_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        print(f"Prune bucket {bucket}")
        prune_entries(bucket, pypi_dict, dump_dict)
        pypi_dict.save()
        dump_dict.save()
        print(f"Calculating jobs for bucket {bucket}")
        jobs = list(get_jobs(bucket, pypi_dict, dump_dict))
        if not jobs:
            continue
        print(f"Starting batch with {len(jobs)} jobs")
        func = mine_wheel_metadata_full_download
        if workers > 1:

            def f(job):
                return exec_or_return_exc(func, job)

            result = parallel(f, (jobs, ), workers=workers)
        else:
            result = [exec_or_return_exc(func, job) for job in jobs]
        for r in result:
            if isinstance(r, Exception):
                continue
            name = r.job.name
            ver = r.job.ver
            pyver = r.job.pyver
            fn = r.job.filename
            # build the nested structure: name -> python version -> version -> filename
            if name not in dump_dict:
                dump_dict[name] = {}
            if pyver not in dump_dict[name]:
                dump_dict[name][pyver] = {}
            if ver not in dump_dict[name][pyver]:
                dump_dict[name][pyver][ver] = {}
            dump_dict[name][pyver][ver][fn] = {}
            for key in ('requires_dist', 'provides_extras',
                        'requires_external', 'requires_python'):
                val = getattr(r, key)
                if val:
                    dump_dict[name][pyver][ver][fn][key] = val
        compress(dump_dict)
        dump_dict.save()
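Example #3 wraps every job in exec_or_return_exc so that a single failing download does not abort the whole batch. A minimal sketch of such a wrapper, under the assumption that it simply returns the caught exception object for later filtering, could look like this; the project's actual helper may capture more context.

def exec_or_return_exc(func, *args, **kwargs):
    # run the job and hand back the exception instead of raising,
    # so the caller can filter failures out of the result list
    try:
        return func(*args, **kwargs)
    except Exception as exc:
        return exc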
Example #4
def main():
    # settings related to performance/parallelization
    amount_buckets = int(os.environ.get('AMOUNT_BUCKETS', "256"))
    limit_names = set(
        filter(lambda n: bool(n),
               os.environ.get('LIMIT_NAMES', "").split(',')))
    max_minutes = int(os.environ.get('MAX_MINUTES', "0"))
    bucket_jobs = int(os.environ.get('BUCKET_JOBS', "0"))
    start_bucket = int(os.environ.get('BUCKET_START', "0"))
    workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count() * 2))

    # general settings
    dump_dir = os.environ.get('DUMP_DIR', "./sdist")
    extractor_src = os.environ.get("EXTRACTOR_SRC")
    if not extractor_src:
        raise Exception(
            "Set env variable 'EXTRACTOR_SRC' to {mach-nix}/lib/extractor")
    min_free_gb = int(os.environ.get('MIN_FREE_GB', "0"))
    py_vers_short = os.environ.get('PYTHON_VERSIONS',
                                   "27,36,37,38,39,310").strip().split(',')
    pypi_fetcher_dir = os.environ.get('PYPI_FETCHER', '/tmp/pypi_fetcher')
    store = os.environ.get('STORE', None)

    deadline_total = time() + max_minutes * 60 if max_minutes else None

    # cache build time deps, otherwise first job will be slow
    with Measure("ensure build time deps"):
        build_base(extractor_src, py_vers_short, store=store)

    garbage_collected = False

    for idx, bucket in enumerate(LazyBucketDict.bucket_keys()):
        # calculate per bucket deadline if MAX_MINUTES is used
        if deadline_total:
            amount = min(amount_buckets, 256 - start_bucket)
            deadline = time() + (deadline_total - time()) / amount
        else:
            deadline = None
        if idx < start_bucket or idx >= start_bucket + amount_buckets:
            continue
        pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        pypi_index = LazyBucketDict(f"{pypi_fetcher_dir}/pypi",
                                    restrict_to_bucket=bucket)
        # load error data
        error_dict = LazyBucketDict(dump_dir + "-errors",
                                    restrict_to_bucket=bucket)
        decompress(error_dict.by_bucket(bucket))
        with Measure('Get processed pkgs'):
            print(
                f"DB contains {len(list(pkgs_dict.keys()))} pkgs at this time for bucket {bucket}"
            )
        with Measure("decompressing data"):
            decompress(pkgs_dict.by_bucket(bucket))
        # purge data for old python versions and packages which got deleted from pypi
        with Measure("purging packages"):
            purge(pypi_index, pkgs_dict, bucket, py_vers_short)
        with Measure("getting jobs"):
            jobs = get_jobs(pypi_index,
                            error_dict,
                            pkgs_dict,
                            bucket,
                            py_vers_short,
                            limit_num=bucket_jobs,
                            limit_names=limit_names)
            if not jobs:
                continue
            compute_drvs(jobs, extractor_src, store=store)

        # ensure that all the build time dependencies are cached before starting,
        # otherwise jobs might time out
        if garbage_collected:
            with Measure("ensure build time deps"):
                build_base(extractor_src, py_vers_short, store=store)
        with Measure('executing jobs'):
            if workers > 1:
                pool_results = utils.parallel(extract_requirements,
                                              (jobs, (deadline, ) * len(jobs),
                                               (len(jobs), ) * len(jobs),
                                               (store, ) * len(jobs)),
                                              workers=workers,
                                              use_processes=False)
            else:
                pool_results = [
                    extract_requirements(job, deadline, len(jobs), store)
                    for job in jobs
                ]

        # filter out exceptions
        results = []
        for res in pool_results:
            if not isinstance(res, Exception):
                results.extend(res)

        # insert new data
        for pkg in sorted(results,
                          key=lambda pkg:
                          (pkg.name, pkg.version, sort_key_pyver(pkg.py_ver))):
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            if pkg.error:
                target = error_dict
            else:
                target = pkgs_dict
            insert(py_ver,
                   pkg.name,
                   pkg.version,
                   pkg_to_dict(pkg),
                   target,
                   error=pkg.error)

        # compress and save
        with Measure("compressing data"):
            compress(pkgs_dict.by_bucket(bucket))
            compress(error_dict.by_bucket(bucket))
        print("finished compressing data")
        with Measure("saving data"):
            pkgs_dict.save()
            error_dict.save()

        # collect garbage if free space < MIN_FREE_GB
        free_gb = shutil.disk_usage(store or "/nix/store").free / (1000 ** 3)
        if free_gb < min_free_gb:
            with Measure("collecting nix store garbage"):
                sp.run(
                    f"nix-collect-garbage {f'--store {store}' if store else ''}",
                    capture_output=True,
                    shell=True)
                garbage_collected = True

        # stop execution if deadline occurred
        if deadline_total and time() > deadline_total:
            print(
                f"Deadline occurred. Stopping execution. Last Bucket was {bucket}"
            )
            break
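The last example times each stage with a Measure context manager that is not shown. A possible minimal sketch, assuming it only prints the elapsed wall-clock time of the labeled block (the project's real implementation may log more), is:

from time import time

class Measure:
    # hypothetical timing context manager; prints how long the wrapped block took
    def __init__(self, label):
        self.label = label

    def __enter__(self):
        self.start = time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        print(f"{self.label} took {time() - self.start:.1f}s")
        return False  # never suppress exceptions from the block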