def compute_memory_per_worker(
    n_workers: int = 1,
    mem_safety_margin: Optional[Union[str, int]] = None,
    memory_limit: Optional[Union[str, int]] = None,
) -> int:
    """
    Figure out how much memory to assign per worker.

    The result can be passed into the ``memory_limit=`` parameter of a Dask
    worker/cluster/client.
    """
    from dask.utils import parse_bytes

    if isinstance(memory_limit, str):
        memory_limit = parse_bytes(memory_limit)

    if isinstance(mem_safety_margin, str):
        mem_safety_margin = parse_bytes(mem_safety_margin)

    if memory_limit is None and mem_safety_margin is None:
        total_bytes = get_total_available_memory()
        # Leave 500 MiB, or half of all memory if RAM is less than 1 GiB
        mem_safety_margin = min(500 * (1024 * 1024), total_bytes // 2)
    elif memory_limit is None:
        total_bytes = get_total_available_memory()
    elif mem_safety_margin is None:
        total_bytes = memory_limit
        mem_safety_margin = 0
    else:
        total_bytes = memory_limit

    return (total_bytes - mem_safety_margin) // n_workers
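A minimal usage sketch (not part of the original snippet): the integer returned above can be handed to Dask's ``memory_limit=``. It assumes compute_memory_per_worker() and its get_total_available_memory() helper are importable from the surrounding module.

# Illustrative only: split available RAM across four local workers,
# keeping a 1 GiB safety margin.
from dask.distributed import LocalCluster

n_workers = 4
per_worker = compute_memory_per_worker(n_workers=n_workers, mem_safety_margin="1GiB")
cluster = LocalCluster(n_workers=n_workers, memory_limit=per_worker)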
def test_parse_bytes():
    assert parse_bytes("100") == 100
    assert parse_bytes("100 MB") == 100000000
    assert parse_bytes("100M") == 100000000
    assert parse_bytes("5kB") == 5000
    assert parse_bytes("5.4 kB") == 5400
    assert parse_bytes("1kiB") == 1024
    assert parse_bytes("1Mi") == 2 ** 20
    assert parse_bytes("1e6") == 1000000
    assert parse_bytes("1e6 kB") == 1000000000
    assert parse_bytes("MB") == 1000000
def test_parse_bytes():
    assert parse_bytes("100") == 100
    assert parse_bytes("100 MB") == 100000000
    assert parse_bytes("100M") == 100000000
    assert parse_bytes("5kB") == 5000
    assert parse_bytes("5.4 kB") == 5400
    assert parse_bytes("1kiB") == 1024
    assert parse_bytes("1Mi") == 2**20
    assert parse_bytes("1e6") == 1000000
    assert parse_bytes("1e6 kB") == 1000000000
    assert parse_bytes("MB") == 1000000
def test_chunksize(tmpdir, chunksize, metadata):
    nparts = 2
    df_size = 100
    row_group_size = 5
    row_group_byte_size = 451  # Empirically measured

    df = pd.DataFrame(
        {
            "a": np.random.choice(["apple", "banana", "carrot"], size=df_size),
            "b": np.random.random(size=df_size),
            "c": np.random.randint(1, 5, size=df_size),
            "index": np.arange(0, df_size),
        }
    ).set_index("index")

    ddf1 = dd.from_pandas(df, npartitions=nparts)
    ddf1.to_parquet(
        str(tmpdir),
        engine="pyarrow",
        row_group_size=row_group_size,
        write_metadata_file=metadata,
    )

    if metadata:
        path = str(tmpdir)
    else:
        dirname = str(tmpdir)
        files = os.listdir(dirname)
        assert "_metadata" not in files
        path = os.path.join(dirname, "*.parquet")

    ddf2 = dask_cudf.read_parquet(
        path,
        chunksize=chunksize,
        split_row_groups=True,
        gather_statistics=True,
        index="index",
    )

    assert_eq(ddf1, ddf2, check_divisions=False)

    num_row_groups = df_size // row_group_size
    if not chunksize:
        assert ddf2.npartitions == num_row_groups
    else:
        # Check that we are really aggregating
        df_byte_size = row_group_byte_size * num_row_groups
        expected = df_byte_size // parse_bytes(chunksize)
        remainder = (df_byte_size % parse_bytes(chunksize)) > 0
        expected += int(remainder) * nparts
        assert ddf2.npartitions == max(nparts, expected)
def read_csv(path, chunksize="128 MiB", **kwargs): if isinstance(chunksize, str): chunksize = parse_bytes(chunksize) filenames = sorted(glob(str(path))) # TODO: lots of complexity name = "read-csv-" + tokenize(path, tokenize, ** kwargs) # TODO: get last modified time meta = cudf.read_csv(filenames[0], **kwargs) dsk = {} i = 0 for fn in filenames: size = os.path.getsize(fn) for start in range(0, size, chunksize): kwargs2 = kwargs.copy() kwargs2["byte_range"] = ( start, chunksize, ) # specify which chunk of the file we care about if start != 0: kwargs2[ "names"] = meta.columns # no header in the middle of the file kwargs2["header"] = None dsk[(name, i)] = (apply, cudf.read_csv, [fn], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) return dd.core.new_dd_object(dsk, name, meta, divisions)
def scale(self, n=0, memory=None, cores=None):
    if memory is not None:
        n = max(n, int(math.ceil(parse_bytes(memory) / self._memory_per_worker())))

    if cores is not None:
        n = max(n, int(math.ceil(cores / self._threads_per_worker())))

    if len(self.worker_spec) > n:
        not_yet_launched = set(self.worker_spec) - {
            v["name"] for v in self.scheduler_info["workers"].values()
        }
        while len(self.worker_spec) > n and not_yet_launched:
            del self.worker_spec[not_yet_launched.pop()]

    while len(self.worker_spec) > n:
        self.worker_spec.popitem()

    if self.status not in (Status.closing, Status.closed):
        while len(self.worker_spec) < n:
            self.worker_spec.update(self.new_worker_spec())

    self.loop.add_callback(self._correct_state)

    if self.asynchronous:
        return NoOpAwaitable()
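A hedged usage sketch of the scale() method above on a SpecCluster-style deployment; the worker count, memory, and core values are illustrative only.

# Assumes `cluster` is an object exposing the scale() method shown above.
cluster.scale(n=2)            # keep at least two workers
cluster.scale(memory="10GB")  # enough workers to cover 10 GB in aggregate
cluster.scale(cores=16)       # enough workers to cover 16 threads in aggregate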
def test_job_script(tmpdir):
    log_directory = tmpdir.strpath
    with SGECluster(
        cores=6,
        processes=2,
        memory="12GB",
        queue="my-queue",
        project="my-project",
        walltime="02:00:00",
        env_extra=["export MY_VAR=my_var"],
        job_extra=["-w e", "-m e"],
        log_directory=log_directory,
        resource_spec="h_vmem=12G,mem_req=12G",
    ) as cluster:
        job_script = cluster.job_script()
        formatted_bytes = format_bytes(parse_bytes("6GB")).replace(" ", "")
        for each in [
            "--nprocs 2",
            "--nthreads 3",
            f"--memory-limit {formatted_bytes}",
            "-q my-queue",
            "-P my-project",
            "-l h_rt=02:00:00",
            "export MY_VAR=my_var",
            "#$ -w e",
            "#$ -m e",
            "#$ -e {}".format(log_directory),
            "#$ -o {}".format(log_directory),
            "-l h_vmem=12G,mem_req=12G",
            "#$ -cwd",
            "#$ -j y",
        ]:
            assert each in job_script
def disk(clear=False, size_limit=disk_size_limit, eviction_policy='least-recently-stored'):
    '''Store cached values using the diskcache library.

    The path used to store the cache is: ~/.vaex/cache/diskcache

    :param int or str size_limit: Max size of cache in bytes (or use a string like '128MB')
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-settings for more details.
    :param str eviction_policy: Eviction policy,
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-eviction-policies
    :param bool clear: Remove all disk space used for caching before turning on the cache.
    '''
    from dask.utils import parse_bytes
    size_limit = parse_bytes(size_limit)
    global cache
    old_cache = cache
    path = vaex.utils.get_private_dir('cache/diskcache')
    if clear:
        try:
            log.debug(f"Clearing disk cache: {path}")
            shutil.rmtree(path)
        except OSError:  # Windows wonkiness
            log.exception(f"Error clearing disk cache: {path}")
    log.debug(f"Initializing disk cache: {path}")
    cache = diskcache.Cache(path, size_limit=size_limit, eviction_policy=eviction_policy)
    yield
    log.debug("Restored old cache")
    cache = old_cache
def worker_count_info(client, gpu_sizes=["16GB", "32GB", "40GB"], tol="2.1GB"):
    """
    Accepts the Client object, GPU sizes, and a tolerance limit, and returns a
    dictionary containing the number of workers per specified GPU size.
    """
    counts_by_gpu_size = dict.fromkeys(gpu_sizes, 0)
    worker_info = client.scheduler_info()["workers"]
    for worker, info in worker_info.items():
        # Assumption is that a node is homogeneous (on a specific node all gpus have the same size)
        worker_device_memory = info["gpu"]["memory-total"][0]
        for gpu_size in gpu_sizes:
            if abs(parse_bytes(gpu_size) - worker_device_memory) < parse_bytes(tol):
                counts_by_gpu_size[gpu_size] += 1
                break

    return counts_by_gpu_size
def test_batch_files_single(self):
    # Arrange
    instrument_ids = self.catalog.instruments()["id"].unique().tolist()

    base = BacktestDataConfig(
        catalog_path=str(self.catalog.path),
        catalog_fs_protocol=self.catalog.fs.protocol,
        data_cls_path="nautilus_trader.model.orderbook.data.OrderBookData",
    )

    iter_batches = batch_files(
        catalog=self.catalog,
        data_configs=[
            base.replace(instrument_id=instrument_ids[0]),
            base.replace(instrument_id=instrument_ids[1]),
        ],
        target_batch_size_bytes=parse_bytes("10kib"),
        read_num_rows=300,
    )

    # Act
    timestamp_chunks = []
    for batch in iter_batches:
        timestamp_chunks.append([b.ts_init for b in batch])

    # Assert
    latest_timestamp = 0
    for timestamps in timestamp_chunks:
        assert max(timestamps) > latest_timestamp
        latest_timestamp = max(timestamps)
        assert timestamps == sorted(timestamps)
def disk(clear=False, size_limit=vaex.settings.cache.disk_size_limit, eviction_policy="least-recently-stored"):
    """Store cached values using the diskcache library.

    See configuration details at `configuration of cache <conf.html#disk-size-limit>`_
    and `configuration of paths <conf.html#cache-compute>`_.

    :param int or str size_limit: Max size of cache in bytes (or use a string like '128MB')
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-settings for more details.
    :param str eviction_policy: Eviction policy,
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-eviction-policies
    :param bool clear: Remove all disk space used for caching before turning on the cache.
    """
    from dask.utils import parse_bytes
    size_limit = parse_bytes(size_limit)
    global cache
    old_cache = cache
    path = vaex.settings.cache.path
    if clear:
        try:
            log.debug(f"Clearing disk cache: {path}")
            shutil.rmtree(path)
        except OSError:  # Windows wonkiness
            log.exception(f"Error clearing disk cache: {path}")
    log.debug(f"Initializing disk cache: {path}")
    cache = diskcache.Cache(path, size_limit=size_limit, eviction_policy=eviction_policy)
    yield
    log.debug("Restored old cache")
    cache = old_cache
def test_batch_generic_data(self):
    # Arrange
    TestStubs.setup_news_event_persistence()
    process_files(
        glob_path=f"{PACKAGE_ROOT}/data/news_events.csv",
        reader=CSVReader(block_parser=TestStubs.news_event_parser),
        catalog=self.catalog,
    )
    data_config = BacktestDataConfig(
        catalog_path="/root/",
        catalog_fs_protocol="memory",
        data_cls_path=f"{NewsEventData.__module__}.NewsEventData",
        client_id="NewsClient",
    )
    # Add some arbitrary instrument data to appease BacktestEngine
    instrument_data_config = BacktestDataConfig(
        catalog_path="/root/",
        catalog_fs_protocol="memory",
        instrument_id=self.catalog.instruments(as_nautilus=True)[0].id.value,
        data_cls_path=f"{InstrumentStatusUpdate.__module__}.InstrumentStatusUpdate",
    )
    run_config = BacktestRunConfig(
        data=[data_config, instrument_data_config],
        persistence=BetfairTestStubs.persistence_config(catalog_path=self.catalog.path),
        venues=[BetfairTestStubs.betfair_venue_config()],
        strategies=[],
        batch_size_bytes=parse_bytes("1mib"),
    )

    # Act
    node = BacktestNode()
    node.run_sync([run_config])

    # Assert
    assert node
def test_job_script():
    with OARCluster(walltime="00:02:00", processes=4, cores=8, memory="28GB") as cluster:
        job_script = cluster.job_script()
        assert "#OAR" in job_script
        assert "#OAR -n dask-worker" in job_script
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--memory-limit {formatted_bytes}" in job_script
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in job_script
        assert "#OAR --project" not in job_script
        assert "#OAR -q" not in job_script

        assert "export " not in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script

    with OARCluster(
        walltime="00:02:00",
        processes=4,
        cores=8,
        memory="28GB",
        env_extra=[
            'export LANG="en_US.utf8"',
            'export LANGUAGE="en_US.utf8"',
            'export LC_ALL="en_US.utf8"',
        ],
    ) as cluster:
        job_script = cluster.job_script()
        assert "#OAR" in job_script
        assert "#OAR -n dask-worker" in job_script
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--memory-limit {formatted_bytes}" in job_script
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in job_script
        assert "#OAR --project" not in job_script
        assert "#OAR -q" not in job_script

        assert 'export LANG="en_US.utf8"' in job_script
        assert 'export LANGUAGE="en_US.utf8"' in job_script
        assert 'export LC_ALL="en_US.utf8"' in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script
def make_raw_files(glob_path, block_size="128mb", compression="infer", **kw) -> List[RawFile]:
    files = scan_files(glob_path, compression=compression, **kw)
    return [RawFile(open_file=f, block_size=parse_bytes(block_size)) for f in files]
def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
    if isinstance(chunksize, str):
        chunksize = parse_bytes(chunksize)

    if isinstance(path, list):
        filenames = path
    elif isinstance(path, str):
        filenames = sorted(glob(path))
    elif hasattr(path, "__fspath__"):
        filenames = sorted(glob(path.__fspath__()))
    else:
        raise TypeError("Path type not understood:{}".format(type(path)))

    if not filenames:
        msg = f"A file in: {filenames} does not exist."
        raise FileNotFoundError(msg)

    name = "read-csv-" + tokenize(path, tokenize, **kwargs)  # TODO: get last modified time

    compression = kwargs.get("compression", False)

    if compression and chunksize:
        # Compressed CSVs must be read in their entirety
        kwargs.pop("byte_range", None)
        warn(
            "Warning %s compression does not support breaking apart files\n"
            "Please ensure that each individual file can fit in memory and\n"
            "use the keyword ``chunksize=None`` to remove this message\n"
            "Setting ``chunksize=(size of file)``" % compression
        )
        chunksize = None

    if chunksize is None:
        return read_csv_without_chunksize(path, **kwargs)

    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
    meta = dask_reader(filenames[0], **kwargs)._meta

    dsk = {}
    i = 0
    dtypes = meta.dtypes.values

    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, chunksize):
            kwargs2 = kwargs.copy()
            # Specify which chunk of the file we care about
            kwargs2["byte_range"] = (start, chunksize)
            if start != 0:
                # No header in the middle of the file
                kwargs2["names"] = meta.columns
                kwargs2["header"] = None
            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)
            i += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
def batch_files(  # noqa: C901
    catalog: DataCatalog,
    data_configs: List[BacktestDataConfig],
    read_num_rows: int = 10000,
    target_batch_size_bytes: int = parse_bytes("100mb"),  # noqa: B008
):
    files = build_filenames(catalog=catalog, data_configs=data_configs)
    buffer = {fn.filename: pd.DataFrame() for fn in files}
    datasets = {
        f.filename: dataset_batches(file_meta=f, fs=catalog.fs, n_rows=read_num_rows)
        for f in files
    }
    completed: Set[str] = set()
    bytes_read = 0
    values = []
    sent_count = 0
    while set([f.filename for f in files]) != completed:
        # Fill buffer (if required)
        for fn in buffer:
            if len(buffer[fn]) < read_num_rows:
                next_buf = next(datasets[fn], None)
                if next_buf is None:
                    completed.add(fn)
                    continue
                buffer[fn] = pd.concat([buffer[fn], next_buf])

        # Determine minimum timestamp
        max_ts_per_frame = {fn: df["ts_init"].max() for fn, df in buffer.items() if not df.empty}
        if not max_ts_per_frame:
            continue
        min_ts = min(max_ts_per_frame.values())

        # Filter buffer dataframes based on min_timestamp
        batches = []
        for f in files:
            df = buffer[f.filename]
            if df.empty:
                continue
            ts_filter = df["ts_init"] <= min_ts  # min of max timestamps
            batch = df[ts_filter]
            buffer[f.filename] = df[~ts_filter]
            objs = frame_to_nautilus(df=batch, cls=f.datatype)
            batches.append(objs)
            bytes_read += sum([sys.getsizeof(x) for x in objs])

        # Merge ticks
        values.extend(list(heapq.merge(*batches, key=lambda x: x.ts_init)))

        if bytes_read > target_batch_size_bytes:
            yield values
            sent_count += len(values)
            bytes_read = 0
            values = []

    if values:
        yield values
        sent_count += len(values)

    if sent_count == 0:
        raise ValueError("No data found, check data_configs")
def test_getitem_avoids_large_chunks():
    a = np.arange(4 * 500 * 500).reshape(4, 500, 500)
    arr = da.from_array(a, chunks=(1, 500, 500))
    indexer = [0, 1] + [2] * 100 + [3]
    result = arr[indexer]

    chunk_size = utils.parse_bytes(config.get("array.chunk-size"))
    assert all(x.nbytes < chunk_size for x in result.blocks)

    expected = a[indexer]
    assert_eq(result, expected)
def batch_files(
    catalog: DataCatalog,
    data_configs: List[BacktestDataConfig],
    read_num_rows: int = 10000,
    target_batch_size_bytes: int = parse_bytes("100mb"),  # noqa: B008
):
    files = build_filenames(catalog=catalog, data_configs=data_configs)
    buffer = {fn.filename: pd.DataFrame() for fn in files}
    datasets = {
        f.filename: dataset_batches(file_meta=f, fs=catalog.fs, n_rows=read_num_rows)
        for f in files
    }
    completed: Set[str] = set()
    bytes_read = 0
    values = []
    while set([f.filename for f in files]) != completed:
        # Fill buffer (if required)
        for fn in buffer:
            if len(buffer[fn]) < read_num_rows:
                next_buf = next(datasets[fn], None)
                if next_buf is None:
                    completed.add(fn)
                    continue
                buffer[fn] = buffer[fn].append(next_buf)

        # Determine minimum timestamp
        max_ts_per_frame = [df["ts_init"].max() for df in buffer.values() if not df.empty]
        if not max_ts_per_frame:
            continue
        min_ts = min(max_ts_per_frame)

        # Filter buffer dataframes based on min_timestamp
        batches = []
        for f in files:
            df = buffer[f.filename]
            if df.empty:
                continue
            ts_filter = df["ts_init"] <= min_ts
            batch = df[ts_filter]
            buffer[f.filename] = df[~ts_filter]
            # print(f"{f.filename} batch={len(batch)} buffer={len(buffer)}")
            objs = frame_to_nautilus(df=batch, cls=f.datatype)
            batches.append(objs)
            bytes_read += sum([sys.getsizeof(x) for x in objs])

        # Merge ticks
        values.extend(list(heapq.merge(*batches, key=lambda x: x.ts_init)))

        # print(f"iter complete, {bytes_read=}, flushing at target={target_batch_size_bytes}")
        if bytes_read > target_batch_size_bytes:
            yield values
            bytes_read = 0
            values = []

    if values:
        yield values
def test_backtest_run_streaming_sync(self):
    # Arrange
    node = BacktestNode()
    base = self.backtest_configs[0]
    config = base.replace(strategies=self.strategies, batch_size_bytes=parse_bytes("10kib"))

    # Act
    results = node.run_sync([config])

    # Assert
    assert len(results) == 1
def worker_count_info(client):
    """
    Accepts the Client object and returns a dictionary containing the number of
    workers per GPU size. Assumes all GPUs are of the same type.
    """
    gpu_sizes = ["16GB", "32GB", "40GB"]
    counts_by_gpu_size = dict.fromkeys(gpu_sizes, 0)
    tolerance = "2.6GB"

    worker_info = client.scheduler_info()["workers"]
    for worker, info in worker_info.items():
        worker_device_memory = info["gpu"]["memory-total"]
        for gpu_size in gpu_sizes:
            if abs(parse_bytes(gpu_size) - worker_device_memory) < parse_bytes(tolerance):
                counts_by_gpu_size[gpu_size] += 1
                break

    return counts_by_gpu_size
def test_job_script(Cluster):
    with Cluster(walltime="00:02:00", processes=4, cores=8, memory="28GB") as cluster:
        job_script = cluster.job_script()
        assert "#PBS" in job_script
        assert "#PBS -N dask-worker" in job_script
        assert "#PBS -l select=1:ncpus=8:mem=27GB" in job_script
        assert "#PBS -l walltime=00:02:00" in job_script
        assert "#PBS -q" not in job_script
        assert "#PBS -A" not in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script

    with Cluster(
        queue="regular",
        project="DaskOnPBS",
        processes=4,
        cores=8,
        resource_spec="select=1:ncpus=24:mem=100GB",
        memory="28GB",
    ) as cluster:
        job_script = cluster.job_script()
        assert "#PBS -q regular" in job_script
        assert "#PBS -N dask-worker" in job_script
        assert "#PBS -l select=1:ncpus=24:mem=100GB" in job_script
        assert "#PBS -l select=1:ncpus=8:mem=27GB" not in job_script
        assert "#PBS -l walltime=" in job_script
        assert "#PBS -A DaskOnPBS" in job_script

        assert (
            "{} -m distributed.cli.dask_worker tcp://".format(sys.executable)
            in job_script
        )
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script
def _memory_per_worker(self) -> int:
    """Return the memory limit per worker for new workers"""
    if not self.new_spec:  # pragma: no cover
        raise ValueError(
            "to scale by memory= your worker definition must include a "
            "memory_limit definition"
        )

    for name in ["memory_limit", "memory"]:
        with suppress(KeyError):
            return parse_bytes(self.new_spec["options"][name])

    raise ValueError(
        "to use scale(memory=...) your worker definition must include a "
        "memory_limit definition"
    )
def __init__(
    self,
    paths,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    **kwargs,
):
    if part_size:
        # If a specific partition size is given, use it directly
        part_size = parse_bytes(part_size)
    else:
        # If a fractional partition size is given, calculate part_size
        part_mem_fraction = part_mem_fraction or 0.125
        assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
        if part_mem_fraction > 0.25:
            warnings.warn(
                "Using very large partition sizes for Dask. "
                "Memory-related errors are likely."
            )
        part_size = int(device_mem_size(kind="total") * part_mem_fraction)

    # Engine-agnostic path handling
    if hasattr(paths, "name"):
        paths = stringify_path(paths)
    if isinstance(paths, str):
        paths = [paths]

    storage_options = storage_options or {}
    # If engine is not provided, try to infer it from the end of paths[0]
    if engine is None:
        engine = paths[0].split(".")[-1]
    if isinstance(engine, str):
        if engine == "parquet":
            self.engine = ParquetDatasetEngine(
                paths, part_size, storage_options=storage_options, **kwargs
            )
        elif engine == "csv":
            self.engine = CSVDatasetEngine(
                paths, part_size, storage_options=storage_options, **kwargs
            )
        else:
            raise ValueError("Only parquet and csv supported (for now).")
    else:
        self.engine = engine(paths, part_size, storage_options=storage_options)
def lsf_unit_detection_helper(expected_unit, conf_text=None):
    temp_dir = tempfile.mkdtemp()
    current_lsf_envdir = os.environ.get("LSF_ENVDIR", None)
    os.environ["LSF_ENVDIR"] = temp_dir
    if conf_text is not None:
        with open(os.path.join(temp_dir, "lsf.conf"), "w") as conf_file:
            conf_file.write(conf_text)
    memory_string = "13GB"
    memory_base = parse_bytes(memory_string)
    correct_memory = lsf.lsf_format_bytes_ceil(memory_base, lsf_units=expected_unit)
    with LSFCluster(memory=memory_string, cores=1) as cluster:
        assert "#BSUB -M %s" % correct_memory in cluster.job_header
    rmtree(temp_dir)
    if current_lsf_envdir is None:
        del os.environ["LSF_ENVDIR"]
    else:
        os.environ["LSF_ENVDIR"] = current_lsf_envdir
def _byte_block_counts(
    urlpath,
    blocksize,
    lineterminator=None,
    compression="infer",
    storage_options=None,
    **kwargs,
):
    """Return a list of paths and block counts.

    Logic copied from dask.bytes.read_bytes
    """
    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"

    if compression == "infer":
        paths = get_fs_token_paths(urlpath, mode="rb", storage_options=storage_options)[2]
        compression = infer_compression(paths[0])

    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        blocksize = None

    b_out = read_bytes(
        urlpath,
        delimiter=lineterminator.encode(),
        blocksize=blocksize,
        sample=False,
        compression=compression,
        include_path=True,
        **(storage_options or {}),
    )
    _, values, paths = b_out

    if not isinstance(values[0], (tuple, list)):
        values = [values]
    return paths, [len(v) for v in values]
def __init__(
    self,
    path,
    engine=None,
    part_size=None,
    part_mem_fraction=None,
    storage_options=None,
    **kwargs,
):
    if part_size:
        # If a specific partition size is given, use it directly
        part_size = parse_bytes(part_size)
    else:
        # If a fractional partition size is given, calculate part_size
        part_mem_fraction = part_mem_fraction or 0.125
        assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
        if part_mem_fraction > 0.25:
            warnings.warn(
                "Using very large partition sizes for Dask. "
                "Memory-related errors are likely."
            )
        part_size = int(cuda.current_context().get_memory_info()[1] * part_mem_fraction)

    # Engine-agnostic path handling
    if hasattr(path, "name"):
        path = stringify_path(path)
    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(path, mode="rb", storage_options=storage_options)
    paths = sorted(paths, key=natural_sort_key)

    # If engine is not provided, try to infer it from the end of paths[0]
    if engine is None:
        engine = paths[0].split(".")[-1]
    if isinstance(engine, str):
        if engine == "parquet":
            self.engine = ParquetDatasetEngine(paths, part_size, fs, fs_token, **kwargs)
        elif engine == "csv":
            self.engine = CSVDatasetEngine(paths, part_size, fs, fs_token, **kwargs)
        else:
            raise ValueError("Only parquet and csv supported (for now).")
    else:
        self.engine = engine(paths, part_size, fs, fs_token, **kwargs)
def read_csv(path, chunksize="256 MiB", **kwargs): if isinstance(chunksize, str): chunksize = parse_bytes(chunksize) filenames = sorted(glob(str(path))) # TODO: lots of complexity name = "read-csv-" + tokenize(path, tokenize, ** kwargs) # TODO: get last modified time compression = kwargs.get("compression", False) if compression: # compressed CSVs reading must read the entire file kwargs.pop("byte_range", None) warn("Warning %s compression does not support breaking apart files\n" "Please ensure that each individual file can fit in memory and\n" "use the keyword ``chunksize=None to remove this message``\n" "Setting ``chunksize=(size of file)``" % compression) chunksize = None if chunksize is None: return read_csv_without_chunksize(path, **kwargs) meta = cudf.read_csv(filenames[0], **kwargs) dsk = {} i = 0 dtypes = meta.dtypes.values for fn in filenames: size = os.path.getsize(fn) for start in range(0, size, chunksize): kwargs2 = kwargs.copy() kwargs2["byte_range"] = ( start, chunksize, ) # specify which chunk of the file we care about if start != 0: kwargs2[ "names"] = meta.columns # no header in the middle of the file kwargs2["header"] = None dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) return dd.core.new_dd_object(dsk, name, meta, divisions)
def parse_memory_limit(
    memory_limit: str | float, nthreads: int, total_cores: int = CPU_COUNT
) -> int | None:
    if memory_limit is None:
        return None

    if memory_limit == "auto":
        memory_limit = int(system.MEMORY_LIMIT * min(1, nthreads / total_cores))
    with suppress(ValueError, TypeError):
        memory_limit = float(memory_limit)

    if isinstance(memory_limit, float) and memory_limit <= 1:
        memory_limit = int(memory_limit * system.MEMORY_LIMIT)

    if isinstance(memory_limit, str):
        memory_limit = parse_bytes(memory_limit)
    else:
        memory_limit = int(memory_limit)

    assert isinstance(memory_limit, int)
    if memory_limit == 0:
        return None
    return min(memory_limit, system.MEMORY_LIMIT)
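A hedged illustration of how parse_memory_limit resolves its inputs, assuming the function above is importable and that system.MEMORY_LIMIT reports the total host RAM in bytes.

# Strings go through parse_bytes, floats <= 1 are treated as fractions of
# total RAM, and "auto" splits total RAM proportionally to threads per
# worker; every result is capped at system.MEMORY_LIMIT.
parse_memory_limit("4GB", nthreads=1)                  # min(4_000_000_000, total RAM)
parse_memory_limit(0.5, nthreads=1)                    # half of total RAM
parse_memory_limit("auto", nthreads=2, total_cores=8)  # a quarter of total RAM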
def parse_device_memory_limit(device_memory_limit, device_index=0):
    """Parse memory limit to be used by a CUDA device.

    Parameters
    ----------
    device_memory_limit: float, int, str or None
        This can be a float (fraction of total device memory), an integer (bytes),
        a string (like 5GB or 5000M), and "auto", 0 or None for the total device size.
    device_index: int
        The index of device from which to obtain the total memory amount.

    Examples
    --------
    >>> # On a 32GB CUDA device
    >>> parse_device_memory_limit(None)
    34089730048
    >>> parse_device_memory_limit(0.8)
    27271784038
    >>> parse_device_memory_limit(1000000000)
    1000000000
    >>> parse_device_memory_limit("1GB")
    1000000000
    """
    if any(device_memory_limit == v for v in [0, "0", None, "auto"]):
        return get_device_total_memory(device_index)

    with suppress(ValueError, TypeError):
        device_memory_limit = float(device_memory_limit)
        if isinstance(device_memory_limit, float) and device_memory_limit <= 1:
            return int(get_device_total_memory(device_index) * device_memory_limit)

    if isinstance(device_memory_limit, str):
        return parse_bytes(device_memory_limit)
    else:
        return int(device_memory_limit)
def convert_chunk_size(chunk_size, factor, dtype, masked):
    """Convert a chunk size given as a string to a number of elements.

    Args:
        chunk_size (None, int, str): Chunk size. Conversion using this function
            is only required if a string is given - other values are simply
            returned.
        factor (int): Number of elements in dimensions which are not chunked,
            e.g. the number of rows if chunking is done exclusively along
            columns. This is used to compute the size per chunk.
        dtype (numpy dtype): Data dtype.
        masked (bool): If True, an additional byte per element will be taken
            into account.

    Returns:
        int or None: Maximum number of elements to stay below the given
        `chunk_size`. If `chunk_size` is None, None will be returned.
    """
    if chunk_size is None or isinstance(chunk_size, int):
        # Do nothing.
        return chunk_size
    element_size = (dtype.itemsize + (1 if masked else 0)) * factor
    return parse_bytes(chunk_size) // element_size
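A small worked example of the arithmetic in convert_chunk_size (the values are illustrative): parse_bytes("100MB") is 100_000_000 bytes, float64 has an itemsize of 8, and a factor of 10 gives 80 bytes per chunked element.

import numpy as np

# 100_000_000 bytes // (8 bytes * 10 unchunked elements) == 1_250_000
assert convert_chunk_size("100MB", factor=10, dtype=np.dtype("float64"), masked=False) == 1_250_000
# With masked=True one extra byte per element is budgeted: 100_000_000 // 90
assert convert_chunk_size("100MB", factor=10, dtype=np.dtype("float64"), masked=True) == 1_111_111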
def memory(maxsize=vaex.settings.cache.memory_size_limit, classname="LRUCache", clear=False):
    """Set a memory cache using cachetools (https://cachetools.readthedocs.io/).

    Calling multiple times with clear=False will keep the current cache (useful in notebook usage).

    :param int or str maxsize: Max size of cache in bytes (or use a string like '128MB')
    :param str classname: Class name in the cachetools library used for the cache (e.g. LRUCache, MRUCache).
    :param bool clear: If True, always create a new cache; if False, keep the existing cache when it is of the same type.
    """
    global cache
    from dask.utils import parse_bytes
    maxsize = parse_bytes(maxsize)
    log.debug("set cache to memory (cachetools)")
    old_cache = cache
    if isinstance(classname, str):
        cls = getattr(cachetools, classname)
    else:
        cls = classname
    if clear or type(cache) != cls:
        cache = cls(maxsize=maxsize)
    yield
    log.debug("restore old cache")
    cache = old_cache
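A minimal usage sketch, assuming the context manager above is exposed as vaex.cache.memory (the module location is an assumption):

import vaex

df = vaex.example()
with vaex.cache.memory(maxsize="256MB"):
    df.x.sum()  # computed once
    df.x.sum()  # served from the cachetools LRU cache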