Example #1
def get_jobs(pypi_fetcher_dir, bucket, processed, amount=1000):
    pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi",
                               restrict_to_bucket=bucket)
    jobs = []
    names = list(pypi_dict.by_bucket(bucket).keys())
    total_nr = 0
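    # collect sdist releases that have not been processed yet, capping the queue at 'amount' jobs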
    for pkg_name in names:
        for ver, release_types in pypi_dict[pkg_name].items():
            if 'sdist' not in release_types:
                continue
            if (pkg_name, ver) in processed:
                continue
            total_nr += 1
            release = release_types['sdist']
            if len(jobs) < amount:
                jobs.append(
                    PackageJob(
                        bucket,
                        pkg_name,
                        ver,
                        f"https://files.pythonhosted.org/packages/source/{pkg_name[0]}/{pkg_name}/{release[1]}",
                        release[0],
                        0,
                    ))
    shuffle(jobs)
    for i, job in enumerate(jobs):
        job.idx = i
    print(
        f"Bucket {bucket}: Planning execution of {len(jobs)} jobs out of {total_nr} total jobs for this bucket"
    )
    return jobs
Example #2
def crawl_pkgs_meta(packages, target_dir, workers):
    pkgs_dict = LazyBucketDict(target_dir)
    args_list = [(name, pkgs_dict) for name in packages]
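    # fetch metadata either in parallel or sequentially, depending on the worker count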
    if workers > 1:
        utils.parallel(save_pkg_meta, zip(*args_list), workers=workers)
    else:
        for args in args_list:
            save_pkg_meta(*args)
    pkgs_dict.save()
Example #3
def names_in_buckets():
    in_buckets = {}
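    # group all package names by bucket key; names are normalized (underscores to dashes, lowercase) first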
    for name in all_packages():
        bucket = LazyBucketDict.bucket(name.replace('_', '-').lower())
        if bucket not in in_buckets:
            in_buckets[bucket] = []
        in_buckets[bucket].append(name)
    return in_buckets
Example #4
def get_names_per_bucket() -> Dict[str, Set[str]]:
    result = {}
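    # pre-create an empty set for each of the 256 two-hex-digit bucket keys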
    hexdigits = "0123456789abcdef"
    for a in hexdigits:
        for b in hexdigits:
            result[a + b] = set()
    keys = [p.name for p in P.select(P.name).distinct()]
    for key in keys:
        result[LazyBucketDict.bucket(key)].add(key)
    return result
Example #5
def main():
    dump_dir = sys.argv[1]
    for bucket_key, key_set in get_names_per_bucket().items():
        pkgs_dict = LazyBucketDict(f"{dump_dir}", restrict_to_bucket=bucket_key)
        pkgs = P.select(
            P.id,
            P.name,
            P.version,
            P.py_ver,
            P.install_requires,
            P.setup_requires,
            P.extras_require,
            P.tests_require,
            P.python_requires,
        ).where(P.error.is_null(), P.name.in_(key_set))
        print(f'dumping bucket {bucket_key}')
        for pkg in sorted(pkgs, key=lambda pkg: (pkg.name, pkg.version, pkg.py_ver)):
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            insert(py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), pkgs_dict)
        compress(pkgs_dict)
        pkgs_dict.save()
def purge(pypi_index, pkgs_dict: LazyBucketDict, bucket, py_vers):
    # purge all versions which are not on pypi anymore
    for name, vers in pkgs_dict.by_bucket(bucket).copy().items():
        if name not in pypi_index:
            print(
                f"deleting package {name} from DB because it has been removed from pypi"
            )
            del pkgs_dict[name]
            continue
        for ver in tuple(vers.keys()):
            if ver not in pypi_index[name]:
                print(
                    f"deleting package {name} version {ver} from DB because it has been removed from pypi"
                )
                del pkgs_dict[name][ver]
    # purge old python versions
    for name, vers in pkgs_dict.by_bucket(bucket).copy().items():
        for ver, pyvers in vers.copy().items():
            for pyver in tuple(pyvers.keys()):
                if pyver not in py_vers:
                    print(
                        f"deleting package {name} version {ver} for python {pyver}"
                        f" from DB because we dropped support for this python version"
                    )
                    del pkgs_dict[name][ver][pyver]
            if len(pkgs_dict[name][ver]) == 0:
                print(
                    f"deleting package {name} version {ver} from DB"
                    f" because it is not compatible with any of our supported python versions"
                )
                del pkgs_dict[name][ver]
        if len(pkgs_dict[name]) == 0:
            print(
                f"deleting package {name} from DB"
                f" because it has no releases left which are compatible with any of our supported python versions"
            )
            del pkgs_dict[name]
Example #7
def get_jobs(bucket, pypi_dict: LazyBucketDict, dump_dict: LazyBucketDict):
    names = list(pypi_dict.by_bucket(bucket).keys())
    jobs = []
    for pkg_name in names:
        for ver, release_types in pypi_dict[pkg_name].items():
            if 'wheels' not in release_types:
                continue
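            # queue one job per wheel file whose metadata has not been mined yet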
            for filename, data in release_types['wheels'].items():
                pyver = data[1]
                if is_done(dump_dict, pkg_name, ver, pyver, filename):
                    continue
                url = construct_url(pkg_name, pyver, filename)
                jobs.append(
                    dict(name=pkg_name,
                         ver=ver,
                         filename=filename,
                         pyver=pyver,
                         url=url,
                         bucket=bucket))
    shuffle(jobs)
    return [Job(**j, nr=idx) for idx, j in enumerate(jobs)]
Example #8
def main():
    workers = int(os.environ.get('WORKERS', "1"))
    pypi_fetcher_dir = os.environ.get('pypi_fetcher', '/tmp/pypi_fetcher')
    ensure_pypi_fetcher(pypi_fetcher_dir)
    init_db()
    build_base(store=os.environ.get('STORE', None))
    P = Package
    with Measure('Get processed pkgs from DB'):
        processed = set((p.name, p.version)
                        for p in P.select(P.name, P.version).distinct())
        print(f"DB contains {len(processed)} pkgs at this time")
    for bucket in LazyBucketDict.bucket_keys():
        with Measure("getting jobs"):
            jobs = get_jobs(pypi_fetcher_dir, bucket, processed, amount=1000)
            if not jobs:
                continue
        with Measure('batch'):
            if workers > 1:
                pool_results = utils.parallel(extract_requirements, (jobs, ),
                                              workers=workers,
                                              use_processes=False)
            else:
                pool_results = [extract_requirements(args) for args in jobs]
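        # keep successful results; print details for failed jobs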
        results = []
        for i, res in enumerate(pool_results):
            if isinstance(res, Exception):
                print(f"Problem with {jobs[i].name}:{jobs[i].version}")
                if isinstance(res, sp.CalledProcessError):
                    print(res.stderr)
                traceback.print_exception(type(res), res, res.__traceback__)
            else:
                for r in res:
                    results.append(r)
        sleep(1)
        with db.atomic():
            with Measure('bulk insert'):
                Package.bulk_create([Package(**r) for r in results])
        if os.environ.get('CLEANUP', None):
            cleanup()
Example #9
def main():
    dump_dir = sys.argv[1]
    workers = int(os.environ.get('WORKERS', "1"))
    pypi_fetcher_dir = os.environ.get('pypi_fetcher')
    print(f'Index directory: {pypi_fetcher_dir}')
    assert isdir(pypi_fetcher_dir)
    for bucket in LazyBucketDict.bucket_keys():
        pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi")
        dump_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        print(f"Prune bucket {bucket}")
        prune_entries(bucket, pypi_dict, dump_dict)
        pypi_dict.save()
        dump_dict.save()
        print(f"Calculating jobs for bucket {bucket}")
        jobs = list(get_jobs(bucket, pypi_dict, dump_dict))
        if not jobs:
            continue
        print(f"Starting batch with {len(jobs)} jobs")
        func = mine_wheel_metadata_full_download
        if workers > 1:

            def f(job):
                return exec_or_return_exc(func, job)

            result = parallel(f, (jobs, ), workers=workers)
        else:
            result = [exec_or_return_exc(func, job) for job in jobs]
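        # merge successfully mined wheel metadata into the nested dump dict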
        for r in result:
            if isinstance(r, Exception):
                continue
            name = r.job.name
            ver = r.job.ver
            pyver = r.job.pyver
            fn = r.job.filename
            if name not in dump_dict:
                dump_dict[name] = {}
            if pyver not in dump_dict[name]:
                dump_dict[name][pyver] = {}
            if ver not in dump_dict[name][pyver]:
                dump_dict[name][pyver][ver] = {}
            dump_dict[name][pyver][ver][fn] = {}
            for key in ('requires_dist', 'provides_extras',
                        'requires_external', 'requires_python'):
                val = getattr(r, key)
                if val:
                    dump_dict[name][pyver][ver][fn][key] = val
        compress(dump_dict)
        dump_dict.save()
def main():
    # settings related to performance/parallelization
    amount_buckets = int(os.environ.get('AMOUNT_BUCKETS', "256"))
    limit_names = set(
        filter(lambda n: bool(n),
               os.environ.get('LIMIT_NAMES', "").split(',')))
    max_minutes = int(os.environ.get('MAX_MINUTES', "0"))
    bucket_jobs = int(os.environ.get('BUCKET_JOBS', "0"))
    start_bucket = int(os.environ.get('BUCKET_START', "0"))
    workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count() * 2))

    # general settings
    dump_dir = os.environ.get('DUMP_DIR', "./sdist")
    extractor_src = os.environ.get("EXTRACTOR_SRC")
    if not extractor_src:
        raise Exception(
            "Set env variable 'EXTRACTOR_SRC to {mach-nix}/lib/extractor'")
    min_free_gb = int(os.environ.get('MIN_FREE_GB', "0"))
    py_vers_short = os.environ.get('PYTHON_VERSIONS',
                                   "27,36,37,38,39,310").strip().split(',')
    pypi_fetcher_dir = os.environ.get('PYPI_FETCHER', '/tmp/pypi_fetcher')
    store = os.environ.get('STORE', None)

    deadline_total = time() + max_minutes * 60 if max_minutes else None

    # cache build time deps, otherwise first job will be slow
    with Measure("ensure build time deps"):
        build_base(extractor_src, py_vers_short, store=store)

    garbage_collected = False

    for idx, bucket in enumerate(LazyBucketDict.bucket_keys()):
        # calculate per bucket deadline if MAX_MINUTES is used
        if deadline_total:
            amount = min(amount_buckets, 256 - start_bucket)
            deadline = time() + (deadline_total - time()) / amount
        else:
            deadline = None
        if idx < start_bucket or idx >= start_bucket + amount_buckets:
            continue
        pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        pypi_index = LazyBucketDict(f"{pypi_fetcher_dir}/pypi",
                                    restrict_to_bucket=bucket)
        # load error data
        error_dict = LazyBucketDict(dump_dir + "-errors",
                                    restrict_to_bucket=bucket)
        decompress(error_dict.by_bucket(bucket))
        with Measure('Get processed pkgs'):
            print(
                f"DB contains {len(list(pkgs_dict.keys()))} pkgs at this time for bucket {bucket}"
            )
        with Measure("decompressing data"):
            decompress(pkgs_dict.by_bucket(bucket))
        # purge data for old python versions and packages which got deleted from pypi
        with Measure("purging packages"):
            purge(pypi_index, pkgs_dict, bucket, py_vers_short)
        with Measure("getting jobs"):
            jobs = get_jobs(pypi_index,
                            error_dict,
                            pkgs_dict,
                            bucket,
                            py_vers_short,
                            limit_num=bucket_jobs,
                            limit_names=limit_names)
            if not jobs:
                continue
            compute_drvs(jobs, extractor_src, store=store)

        # ensure that all the build time dependencies are cached before starting,
        # otherwise jobs might time out
        if garbage_collected:
            with Measure("ensure build time deps"):
                build_base(extractor_src, py_vers_short, store=store)
        with Measure('executing jobs'):
            if workers > 1:
                pool_results = utils.parallel(extract_requirements,
                                              (jobs, (deadline, ) * len(jobs),
                                               (len(jobs), ) * len(jobs),
                                               (store, ) * len(jobs)),
                                              workers=workers,
                                              use_processes=False)
            else:
                pool_results = [
                    extract_requirements(job, deadline, len(jobs), store)
                    for job in jobs
                ]

        # filter out exceptions
        results = []
        for i, res in enumerate(pool_results):
            if not isinstance(res, Exception):
                for r in res:
                    results.append(r)

        # insert new data
        for pkg in sorted(results,
                          key=lambda pkg:
                          (pkg.name, pkg.version, sort_key_pyver(pkg.py_ver))):
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            if pkg.error:
                target = error_dict
            else:
                target = pkgs_dict
            insert(py_ver,
                   pkg.name,
                   pkg.version,
                   pkg_to_dict(pkg),
                   target,
                   error=pkg.error)

        # compress and save
        with Measure("compressing data"):
            compress(pkgs_dict.by_bucket(bucket))
            compress(error_dict.by_bucket(bucket))
        print("finished compressing data")
        with Measure("saving data"):
            pkgs_dict.save()
            error_dict.save()

        # collect garbage if free space < MIN_FREE_GB
        if shutil.disk_usage(store or "/nix/store").free / (1000 ** 3) < min_free_gb:
            with Measure("collecting nix store garbage"):
                sp.run(
                    f"nix-collect-garbage {f'--store {store}' if store else ''}",
                    capture_output=True,
                    shell=True)
                garbage_collected = True

        # stop execution if deadline occurred
        if deadline_total and time() > deadline_total:
            print(
                f"Deadline occurred. Stopping execution. Last Bucket was {bucket}"
            )
            break
def decompress(pkgs_dict: LazyBucketDict):
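    # expand the compressed per-version and per-python-version levels of the nested dict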
    for name, vers in pkgs_dict.items():
        decompress_dict(vers)
        for ver, pyvers in vers.items():
            decompress_dict(pyvers)