def crawl_pkgs_meta(packages, target_dir, workers):
    pkgs_dict = LazyBucketDict(target_dir)
    args_list = [(name, pkgs_dict) for name in packages]
    if workers > 1:
        # utils.parallel expects one sequence per positional argument,
        # so transpose the (name, dict) pairs into two parallel columns.
        utils.parallel(save_pkg_meta, zip(*args_list), workers=workers)
    else:
        for args in args_list:
            save_pkg_meta(*args)
    pkgs_dict.save()
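# The zip(*args_list) transpose above assumes a specific utils.parallel
# contract. The sketch below is a hypothetical reimplementation of that
# contract (parallel_sketch is not part of the project): the second argument
# holds one column per positional parameter, and the callable is applied
# starmap-style to the re-zipped columns.
from concurrent.futures import ThreadPoolExecutor


def parallel_sketch(func, arg_columns, workers=4):
    # arg_columns holds one sequence per positional parameter of func;
    # zip(*arg_columns) re-pairs them into per-call argument tuples.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(lambda args: func(*args), zip(*arg_columns)))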
# entry point: dump mined package metadata from the database into bucket files
def main():
    dump_dir = sys.argv[1]
    for bucket_key, key_set in get_names_per_bucket().items():
        pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket_key)
        pkgs = P.select(
            P.id,
            P.name,
            P.version,
            P.py_ver,
            P.install_requires,
            P.setup_requires,
            P.extras_require,
            P.tests_require,
            P.python_requires,
        ).where(P.error.is_null(), P.name.in_(key_set))
        print(f'dumping bucket {bucket_key}')
        for pkg in sorted(pkgs, key=lambda pkg: (pkg.name, pkg.version, pkg.py_ver)):
            # keep only the digits of the python version, e.g. "3.8" -> "38"
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            insert(py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), pkgs_dict)
        compress(pkgs_dict)
        pkgs_dict.save()
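# Hypothetical sketch of the insert() helper used above and further below (the
# project's real helper may differ, e.g. in how it serializes errors): it files
# the package payload under name -> python version -> release version, matching
# the nested layout the wheel miner below writes as well.
def insert_sketch(py_ver, name, ver, data, target, error=None):
    # create intermediate levels on demand, then store the payload;
    # the error branch is an assumption based on the error_dict usage below
    target.setdefault(name, {}).setdefault(py_ver, {})[ver] = \
        {"error": str(error)} if error else data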
# entry point: mine wheel metadata per bucket and merge it into the dump dir
def main():
    dump_dir = sys.argv[1]
    workers = int(os.environ.get('WORKERS', "1"))
    pypi_fetcher_dir = os.environ.get('pypi_fetcher')
    print(f'Index directory: {pypi_fetcher_dir}')
    assert isdir(pypi_fetcher_dir)
    for bucket in LazyBucketDict.bucket_keys():
        pypi_dict = LazyBucketDict(f"{pypi_fetcher_dir}/pypi")
        dump_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        print(f"Prune bucket {bucket}")
        prune_entries(bucket, pypi_dict, dump_dict)
        pypi_dict.save()
        dump_dict.save()
        print(f"Calculating jobs for bucket {bucket}")
        jobs = list(get_jobs(bucket, pypi_dict, dump_dict))
        if not jobs:
            continue
        print(f"Starting batch with {len(jobs)} jobs")
        func = mine_wheel_metadata_full_download
        if workers > 1:
            def f(job):
                return exec_or_return_exc(func, job)
            result = parallel(f, (jobs,), workers=workers)
        else:
            result = [exec_or_return_exc(func, job) for job in jobs]
        for r in result:
            # failed jobs come back as exceptions instead of raising; skip them
            if isinstance(r, Exception):
                continue
            name, ver, pyver, fn = r.job.name, r.job.ver, r.job.pyver, r.job.filename
            if name not in dump_dict:
                dump_dict[name] = {}
            # create the pyver -> ver levels on demand
            entry = dump_dict[name].setdefault(pyver, {}).setdefault(ver, {})
            entry[fn] = {}
            for key in ('requires_dist', 'provides_extras',
                        'requires_external', 'requires_python'):
                val = getattr(r, key)
                if val:
                    entry[fn][key] = val
        compress(dump_dict)
        dump_dict.save()
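# Plausible shape of exec_or_return_exc as used above (an assumption, not the
# project's verbatim helper): run the mining function and hand the exception
# back as a result instead of raising, so one broken wheel cannot abort the
# whole batch; callers then filter with isinstance(r, Exception).
def exec_or_return_exc_sketch(func, job):
    try:
        return func(job)
    except Exception as e:  # deliberately broad: any failure becomes a result
        return e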
# entry point: extract sdist requirements per bucket via nix builds
def main():
    # settings related to performance/parallelization
    amount_buckets = int(os.environ.get('AMOUNT_BUCKETS', "256"))
    limit_names = set(filter(None, os.environ.get('LIMIT_NAMES', "").split(',')))
    max_minutes = int(os.environ.get('MAX_MINUTES', "0"))
    bucket_jobs = int(os.environ.get('BUCKET_JOBS', "0"))
    start_bucket = int(os.environ.get('BUCKET_START', "0"))
    workers = int(os.environ.get('WORKERS', multiprocessing.cpu_count() * 2))
    # general settings
    dump_dir = os.environ.get('DUMP_DIR', "./sdist")
    extractor_src = os.environ.get("EXTRACTOR_SRC")
    if not extractor_src:
        raise Exception(
            "Set env variable 'EXTRACTOR_SRC' to {mach-nix}/lib/extractor")
    min_free_gb = int(os.environ.get('MIN_FREE_GB', "0"))
    py_vers_short = os.environ.get(
        'PYTHON_VERSIONS', "27,36,37,38,39,310").strip().split(',')
    pypi_fetcher_dir = os.environ.get('PYPI_FETCHER', '/tmp/pypi_fetcher')
    store = os.environ.get('STORE', None)
    deadline_total = time() + max_minutes * 60 if max_minutes else None

    # cache build time deps, otherwise the first job will be slow
    with Measure("ensure build time deps"):
        build_base(extractor_src, py_vers_short, store=store)

    garbage_collected = False
    for idx, bucket in enumerate(LazyBucketDict.bucket_keys()):
        # calculate the per-bucket deadline if MAX_MINUTES is used
        if deadline_total:
            amount = min(amount_buckets, 256 - start_bucket)
            deadline = time() + (deadline_total - time()) / amount
        else:
            deadline = None
        if idx < start_bucket or idx >= start_bucket + amount_buckets:
            continue
        pkgs_dict = LazyBucketDict(dump_dir, restrict_to_bucket=bucket)
        pypi_index = LazyBucketDict(f"{pypi_fetcher_dir}/pypi",
                                    restrict_to_bucket=bucket)
        # load error data
        error_dict = LazyBucketDict(dump_dir + "-errors", restrict_to_bucket=bucket)
        decompress(error_dict.by_bucket(bucket))
        with Measure('Get processed pkgs'):
            print(f"DB contains {len(list(pkgs_dict.keys()))} pkgs at this time"
                  f" for bucket {bucket}")
        with Measure("decompressing data"):
            decompress(pkgs_dict.by_bucket(bucket))
        # purge data for old python versions and packages deleted from pypi
        with Measure("purging packages"):
            purge(pypi_index, pkgs_dict, bucket, py_vers_short)
        with Measure("getting jobs"):
            jobs = get_jobs(pypi_index, error_dict, pkgs_dict, bucket,
                            py_vers_short, limit_num=bucket_jobs,
                            limit_names=limit_names)
        if not jobs:
            continue
        compute_drvs(jobs, extractor_src, store=store)
        # ensure that all build time dependencies are cached before starting,
        # otherwise jobs might time out
        if garbage_collected:
            with Measure("ensure build time deps"):
                build_base(extractor_src, py_vers_short, store=store)
        with Measure('executing jobs'):
            if workers > 1:
                pool_results = utils.parallel(
                    extract_requirements,
                    (jobs,
                     (deadline,) * len(jobs),
                     (len(jobs),) * len(jobs),
                     (store,) * len(jobs)),
                    workers=workers,
                    use_processes=False)
            else:
                # mirror the argument list of the parallel branch above
                # (the original serial call dropped the len(jobs) argument)
                pool_results = [
                    extract_requirements(job, deadline, len(jobs), store)
                    for job in jobs
                ]
        # filter out exceptions
        results = []
        for res in pool_results:
            if not isinstance(res, Exception):
                results.extend(res)
        # insert new data; failed packages go to the error dict instead
        for pkg in sorted(results, key=lambda pkg: (pkg.name, pkg.version,
                                                    sort_key_pyver(pkg.py_ver))):
            py_ver = ''.join(filter(lambda c: c.isdigit(), pkg.py_ver))
            target = error_dict if pkg.error else pkgs_dict
            insert(py_ver, pkg.name, pkg.version, pkg_to_dict(pkg), target,
                   error=pkg.error)
        # compress and save
        with Measure("compressing data"):
            compress(pkgs_dict.by_bucket(bucket))
            compress(error_dict.by_bucket(bucket))
            print("finished compressing data")
        with Measure("saving data"):
            pkgs_dict.save()
            error_dict.save()
        # collect garbage if free space < MIN_FREE_GB
        if shutil.disk_usage(store or "/nix/store").free / (1000 ** 3) < min_free_gb:
            with Measure("collecting nix store garbage"):
                sp.run(f"nix-collect-garbage {f'--store {store}' if store else ''}",
                       capture_output=True, shell=True)
            garbage_collected = True
        # stop execution if the total deadline has passed
        if deadline_total and time() > deadline_total:
            print(f"Deadline occurred. Stopping execution. Last bucket was {bucket}")
            break