Example #1
def compute_memory_per_worker(
        n_workers: int = 1,
        mem_safety_margin: Optional[Union[str, int]] = None,
        memory_limit: Optional[Union[str, int]] = None) -> int:
    """ Figure out how much memory to assign per worker.

        result can be passed into ``memory_limit=`` parameter of dask worker/cluster/client
    """
    from dask.utils import parse_bytes

    if isinstance(memory_limit, str):
        memory_limit = parse_bytes(memory_limit)

    if isinstance(mem_safety_margin, str):
        mem_safety_margin = parse_bytes(mem_safety_margin)

    if memory_limit is None and mem_safety_margin is None:
        total_bytes = get_total_available_memory()
        # leave 500 MiB, or half of all memory if total RAM is under 1 GiB
        mem_safety_margin = min(500 * (1024 * 1024), total_bytes // 2)
    elif memory_limit is None:
        total_bytes = get_total_available_memory()
    elif mem_safety_margin is None:
        total_bytes = memory_limit
        mem_safety_margin = 0
    else:
        total_bytes = memory_limit

    return (total_bytes - mem_safety_margin) // n_workers
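A quick sketch of the arithmetic above, using parse_bytes directly so it runs standalone (the argument values are illustrative, not from the source):

from dask.utils import parse_bytes

# Same math as compute_memory_per_worker(n_workers=4, memory_limit="16GB",
# mem_safety_margin="1GB"): subtract the safety margin, then split evenly.
per_worker = (parse_bytes("16GB") - parse_bytes("1GB")) // 4
assert per_worker == 3_750_000_000  # bytes handed to each worker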
Example #2
def test_parse_bytes():
    assert parse_bytes("100") == 100
    assert parse_bytes("100 MB") == 100000000
    assert parse_bytes("100M") == 100000000
    assert parse_bytes("5kB") == 5000
    assert parse_bytes("5.4 kB") == 5400
    assert parse_bytes("1kiB") == 1024
    assert parse_bytes("1Mi") == 2 ** 20
    assert parse_bytes("1e6") == 1000000
    assert parse_bytes("1e6 kB") == 1000000000
    assert parse_bytes("MB") == 1000000
Example #3
def test_parse_bytes():
    assert parse_bytes("100") == 100
    assert parse_bytes("100 MB") == 100000000
    assert parse_bytes("100M") == 100000000
    assert parse_bytes("5kB") == 5000
    assert parse_bytes("5.4 kB") == 5400
    assert parse_bytes("1kiB") == 1024
    assert parse_bytes("1Mi") == 2**20
    assert parse_bytes("1e6") == 1000000
    assert parse_bytes("1e6 kB") == 1000000000
    assert parse_bytes("MB") == 1000000
Example #4
def test_chunksize(tmpdir, chunksize, metadata):
    nparts = 2
    df_size = 100
    row_group_size = 5
    row_group_byte_size = 451  # Empirically measured

    df = pd.DataFrame({
        "a": np.random.choice(["apple", "banana", "carrot"], size=df_size),
        "b": np.random.random(size=df_size),
        "c": np.random.randint(1, 5, size=df_size),
        "index": np.arange(0, df_size),
    }).set_index("index")

    ddf1 = dd.from_pandas(df, npartitions=nparts)
    ddf1.to_parquet(
        str(tmpdir),
        engine="pyarrow",
        row_group_size=row_group_size,
        write_metadata_file=metadata,
    )

    if metadata:
        path = str(tmpdir)
    else:
        dirname = str(tmpdir)
        files = os.listdir(dirname)
        assert "_metadata" not in files
        path = os.path.join(dirname, "*.parquet")

    ddf2 = dask_cudf.read_parquet(
        path,
        chunksize=chunksize,
        split_row_groups=True,
        gather_statistics=True,
        index="index",
    )

    assert_eq(ddf1, ddf2, check_divisions=False)

    num_row_groups = df_size // row_group_size
    if not chunksize:
        assert ddf2.npartitions == num_row_groups
    else:
        # Check that we are really aggregating
        df_byte_size = row_group_byte_size * num_row_groups
        expected = df_byte_size // parse_bytes(chunksize)
        remainder = (df_byte_size % parse_bytes(chunksize)) > 0
        expected += int(remainder) * nparts
        assert ddf2.npartitions == max(nparts, expected)
Example #5
def read_csv(path, chunksize="128 MiB", **kwargs):
    if isinstance(chunksize, str):
        chunksize = parse_bytes(chunksize)
    filenames = sorted(glob(str(path)))  # TODO: lots of complexity
    name = "read-csv-" + tokenize(path, tokenize, **
                                  kwargs)  # TODO: get last modified time

    meta = cudf.read_csv(filenames[0], **kwargs)

    dsk = {}
    i = 0
    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, chunksize):
            kwargs2 = kwargs.copy()
            kwargs2["byte_range"] = (
                start,
                chunksize,
            )  # specify which chunk of the file we care about
            if start != 0:
                kwargs2["names"] = meta.columns  # no header in the middle of the file
                kwargs2["header"] = None
            dsk[(name, i)] = (apply, cudf.read_csv, [fn], kwargs2)
            i += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example #6
    def scale(self, n=0, memory=None, cores=None):
        if memory is not None:
            n = max(n, int(math.ceil(parse_bytes(memory) / self._memory_per_worker())))

        if cores is not None:
            n = max(n, int(math.ceil(cores / self._threads_per_worker())))

        if len(self.worker_spec) > n:
            not_yet_launched = set(self.worker_spec) - {
                v["name"]
                for v in self.scheduler_info["workers"].values()
            }
            while len(self.worker_spec) > n and not_yet_launched:
                del self.worker_spec[not_yet_launched.pop()]

        while len(self.worker_spec) > n:
            self.worker_spec.popitem()

        if self.status not in (Status.closing, Status.closed):
            while len(self.worker_spec) < n:
                self.worker_spec.update(self.new_worker_spec())

        self.loop.add_callback(self._correct_state)

        if self.asynchronous:
            return NoOpAwaitable()
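The memory= branch above reduces to a ceiling division; a small illustrative check (the 4GB-per-worker figure is assumed, not taken from the source):

import math
from dask.utils import parse_bytes

# scale(memory="10GB") with a 4GB-per-worker spec asks for
# ceil(10GB / 4GB) = 3 workers.
assert math.ceil(parse_bytes("10GB") / parse_bytes("4GB")) == 3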
Example #7
def test_job_script(tmpdir):
    log_directory = tmpdir.strpath
    with SGECluster(
            cores=6,
            processes=2,
            memory="12GB",
            queue="my-queue",
            project="my-project",
            walltime="02:00:00",
            env_extra=["export MY_VAR=my_var"],
            job_extra=["-w e", "-m e"],
            log_directory=log_directory,
            resource_spec="h_vmem=12G,mem_req=12G",
    ) as cluster:
        job_script = cluster.job_script()
        formatted_bytes = format_bytes(parse_bytes("6GB")).replace(" ", "")

        for each in [
                "--nprocs 2",
                "--nthreads 3",
                f"--memory-limit {formatted_bytes}",
                "-q my-queue",
                "-P my-project",
                "-l h_rt=02:00:00",
                "export MY_VAR=my_var",
                "#$ -w e",
                "#$ -m e",
                "#$ -e {}".format(log_directory),
                "#$ -o {}".format(log_directory),
                "-l h_vmem=12G,mem_req=12G",
                "#$ -cwd",
                "#$ -j y",
        ]:
            assert each in job_script
Example #8
def disk(clear=False, size_limit=disk_size_limit, eviction_policy='least-recently-stored'):
    '''Store cached values using the diskcache library.

    The path to store the cache is: ~/.vaex/cache/diskcache

    :param int or str size_limit: Max size of cache in bytes (or use a string like '128MB')
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-settings for more details.
    :param str eviction_policy: Eviction policy,
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-eviction-policies
    :param bool clear: Remove all disk space used for caching before turning on cache.
    '''
    from dask.utils import parse_bytes
    size_limit = parse_bytes(size_limit)
    global cache
    old_cache = cache
    path = vaex.utils.get_private_dir('cache/diskcache')
    if clear:
        try:
            log.debug(f"Clearing disk cache: {path}")
            shutil.rmtree(path)
        except OSError:  # Windows wonkiness
            log.exception(f"Error clearing disk cache: {path}")
    log.debug(f"Initializing disk cache: {path}")
    cache = diskcache.Cache(path, size_limit=size_limit, eviction_policy=eviction_policy)
    yield
    log.debug("Restored old cache")
    cache = old_cache
Example #9
def worker_count_info(client, gpu_sizes=["16GB", "32GB", "40GB"], tol="2.1GB"):
    """
    Accepts a Client object, a list of GPU sizes, and a tolerance limit, and
    returns a dictionary with the number of workers per specified GPU size.
    """
    counts_by_gpu_size = dict.fromkeys(gpu_sizes, 0)
    worker_info = client.scheduler_info()["workers"]
    for worker, info in worker_info.items():
        # Assumption is that a node is homogeneous (on a specific node all gpus have the same size)
        worker_device_memory = info["gpu"]["memory-total"][0]
        for gpu_size in gpu_sizes:
            if abs(parse_bytes(gpu_size) - worker_device_memory) < parse_bytes(tol):
                counts_by_gpu_size[gpu_size] += 1
                break

    return counts_by_gpu_size
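The bucketing rule above matches a device to the closest advertised size within the tolerance; for illustration, a 32 GB-class device reporting 34089730048 bytes (the figure quoted in Example #29 below) falls into the "32GB" bucket:

from dask.utils import parse_bytes

# |32 GB - 34089730048 B| = 2_089_730_048 B, which is below the 2.1GB tolerance.
assert abs(parse_bytes("32GB") - 34089730048) < parse_bytes("2.1GB")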
Example #10
    def test_batch_files_single(self):
        # Arrange
        instrument_ids = self.catalog.instruments()["id"].unique().tolist()
        base = BacktestDataConfig(
            catalog_path=str(self.catalog.path),
            catalog_fs_protocol=self.catalog.fs.protocol,
            data_cls_path="nautilus_trader.model.orderbook.data.OrderBookData",
        )

        iter_batches = batch_files(
            catalog=self.catalog,
            data_configs=[
                base.replace(instrument_id=instrument_ids[0]),
                base.replace(instrument_id=instrument_ids[1]),
            ],
            target_batch_size_bytes=parse_bytes("10kib"),
            read_num_rows=300,
        )

        # Act
        timestamp_chunks = []
        for batch in iter_batches:
            timestamp_chunks.append([b.ts_init for b in batch])

        # Assert
        latest_timestamp = 0
        for timestamps in timestamp_chunks:
            assert max(timestamps) > latest_timestamp
            latest_timestamp = max(timestamps)
            assert timestamps == sorted(timestamps)
Example #11
def disk(clear=False, size_limit=vaex.settings.cache.disk_size_limit, eviction_policy="least-recently-stored"):
    """Stored cached values using the diskcache library.

    See configuration details at `configuration of cache <conf.html#disk-size-limit>`_ and `configuration of paths <conf.html#cache-compute>`_.

    :param int or str size_limit: Max size of cache in bytes (or use a string like '128MB')
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-settings for more details.
    :param str eviction_policy: Eviction policy,
        See http://www.grantjenks.com/docs/diskcache/tutorial.html?highlight=eviction#tutorial-eviction-policies
    :param bool clear: Remove all disk space used for caching before turning on cache.
    """
    from dask.utils import parse_bytes
    size_limit = parse_bytes(size_limit)
    global cache
    old_cache = cache
    path = vaex.settings.cache.path
    if clear:
        try:
            log.debug(f"Clearing disk cache: {path}")
            shutil.rmtree(path)
        except OSError:  # Windows wonkiness
            log.exception(f"Error clearing disk cache: {path}")
    log.debug(f"Initializing disk cache: {path}")
    cache = diskcache.Cache(path, size_limit=size_limit, eviction_policy=eviction_policy)
    yield
    log.debug("Restored old cache")
    cache = old_cache
Example #12
    def test_batch_generic_data(self):
        # Arrange
        TestStubs.setup_news_event_persistence()
        process_files(
            glob_path=f"{PACKAGE_ROOT}/data/news_events.csv",
            reader=CSVReader(block_parser=TestStubs.news_event_parser),
            catalog=self.catalog,
        )
        data_config = BacktestDataConfig(
            catalog_path="/root/",
            catalog_fs_protocol="memory",
            data_cls_path=f"{NewsEventData.__module__}.NewsEventData",
            client_id="NewsClient",
        )
        # Add some arbitrary instrument data to appease BacktestEngine
        instrument_data_config = BacktestDataConfig(
            catalog_path="/root/",
            catalog_fs_protocol="memory",
            instrument_id=self.catalog.instruments(as_nautilus=True)[0].id.value,
            data_cls_path=f"{InstrumentStatusUpdate.__module__}.InstrumentStatusUpdate",
        )
        run_config = BacktestRunConfig(
            data=[data_config, instrument_data_config],
            persistence=BetfairTestStubs.persistence_config(catalog_path=self.catalog.path),
            venues=[BetfairTestStubs.betfair_venue_config()],
            strategies=[],
            batch_size_bytes=parse_bytes("1mib"),
        )

        # Act
        node = BacktestNode()
        node.run_sync([run_config])

        # Assert
        assert node
Example #13
def test_job_script():
    with OARCluster(walltime="00:02:00", processes=4, cores=8,
                    memory="28GB") as cluster:
        job_script = cluster.job_script()
        assert "#OAR" in job_script
        assert "#OAR -n dask-worker" in job_script
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--memory-limit {formatted_bytes}" in job_script
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in job_script
        assert "#OAR --project" not in job_script
        assert "#OAR -q" not in job_script

        assert "export " not in job_script

        assert ("{} -m distributed.cli.dask_worker tcp://".format(
            sys.executable) in job_script)
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script

    with OARCluster(
            walltime="00:02:00",
            processes=4,
            cores=8,
            memory="28GB",
            env_extra=[
                'export LANG="en_US.utf8"',
                'export LANGUAGE="en_US.utf8"',
                'export LC_ALL="en_US.utf8"',
            ],
    ) as cluster:
        job_script = cluster.job_script()
        assert "#OAR" in job_script
        assert "#OAR -n dask-worker" in job_script
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--memory-limit {formatted_bytes}" in job_script
        assert "#OAR -l /nodes=1/core=8,walltime=00:02:00" in job_script
        assert "#OAR --project" not in job_script
        assert "#OAR -q" not in job_script

        assert 'export LANG="en_US.utf8"' in job_script
        assert 'export LANGUAGE="en_US.utf8"' in job_script
        assert 'export LC_ALL="en_US.utf8"' in job_script

        assert ("{} -m distributed.cli.dask_worker tcp://".format(
            sys.executable) in job_script)
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script
Example #14
def make_raw_files(glob_path,
                   block_size="128mb",
                   compression="infer",
                   **kw) -> List[RawFile]:
    files = scan_files(glob_path, compression=compression, **kw)
    return [
        RawFile(open_file=f, block_size=parse_bytes(block_size)) for f in files
    ]
Example #15
def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
    if isinstance(chunksize, str):
        chunksize = parse_bytes(chunksize)

    if isinstance(path, list):
        filenames = path
    elif isinstance(path, str):
        filenames = sorted(glob(path))
    elif hasattr(path, "__fspath__"):
        filenames = sorted(glob(path.__fspath__()))
    else:
        raise TypeError("Path type not understood:{}".format(type(path)))

    if not filenames:
        msg = f"A file in: {filenames} does not exist."
        raise FileNotFoundError(msg)

    name = "read-csv-" + tokenize(path, tokenize, **
                                  kwargs)  # TODO: get last modified time

    compression = kwargs.get("compression", False)
    if compression and chunksize:
        # reading compressed CSVs requires reading each file in full
        kwargs.pop("byte_range", None)
        warn("Warning: %s compression does not support breaking apart files.\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``chunksize=None`` to remove this message.\n"
             "Setting ``chunksize=(size of file)``" % compression)
        chunksize = None

    if chunksize is None:
        return read_csv_without_chunksize(path, **kwargs)

    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
    meta = dask_reader(filenames[0], **kwargs)._meta

    dsk = {}
    i = 0
    dtypes = meta.dtypes.values

    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, chunksize):
            kwargs2 = kwargs.copy()
            kwargs2["byte_range"] = (
                start,
                chunksize,
            )  # specify which chunk of the file we care about
            if start != 0:
                kwargs2["names"] = meta.columns  # no header in the middle of the file
                kwargs2["header"] = None
            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)

            i += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example #16
def batch_files(  # noqa: C901
    catalog: DataCatalog,
    data_configs: List[BacktestDataConfig],
    read_num_rows: int = 10000,
    target_batch_size_bytes: int = parse_bytes("100mb"),  # noqa: B008
):
    files = build_filenames(catalog=catalog, data_configs=data_configs)
    buffer = {fn.filename: pd.DataFrame() for fn in files}
    datasets = {
        f.filename: dataset_batches(file_meta=f, fs=catalog.fs, n_rows=read_num_rows) for f in files
    }
    completed: Set[str] = set()
    bytes_read = 0
    values = []
    sent_count = 0
    while set([f.filename for f in files]) != completed:
        # Fill buffer (if required)
        for fn in buffer:
            if len(buffer[fn]) < read_num_rows:
                next_buf = next(datasets[fn], None)
                if next_buf is None:
                    completed.add(fn)
                    continue
                buffer[fn] = pd.concat([buffer[fn], next_buf])

        # Determine minimum timestamp
        max_ts_per_frame = {fn: df["ts_init"].max() for fn, df in buffer.items() if not df.empty}
        if not max_ts_per_frame:
            continue
        min_ts = min(max_ts_per_frame.values())

        # Filter buffer dataframes based on min_timestamp
        batches = []
        for f in files:
            df = buffer[f.filename]
            if df.empty:
                continue
            ts_filter = df["ts_init"] <= min_ts  # min of max timestamps
            batch = df[ts_filter]
            buffer[f.filename] = df[~ts_filter]
            objs = frame_to_nautilus(df=batch, cls=f.datatype)
            batches.append(objs)
            bytes_read += sum([sys.getsizeof(x) for x in objs])

        # Merge ticks
        values.extend(list(heapq.merge(*batches, key=lambda x: x.ts_init)))
        if bytes_read > target_batch_size_bytes:
            yield values
            sent_count += len(values)
            bytes_read = 0
            values = []

    if values:
        yield values
        sent_count += len(values)

    if sent_count == 0:
        raise ValueError("No data found, check data_configs")
Example #17
def test_getitem_avoids_large_chunks():
    a = np.arange(4 * 500 * 500).reshape(4, 500, 500)
    arr = da.from_array(a, chunks=(1, 500, 500))
    indexer = [0, 1] + [2] * 100 + [3]
    result = arr[indexer]
    chunk_size = utils.parse_bytes(config.get("array.chunk-size"))
    assert all(x.nbytes < chunk_size for x in result.blocks)
    expected = a[indexer]

    assert_eq(result, expected)
Example #18
def batch_files(
    catalog: DataCatalog,
    data_configs: List[BacktestDataConfig],
    read_num_rows: int = 10000,
    target_batch_size_bytes: int = parse_bytes("100mb"),  # noqa: B008
):
    files = build_filenames(catalog=catalog, data_configs=data_configs)
    buffer = {fn.filename: pd.DataFrame() for fn in files}
    datasets = {
        f.filename: dataset_batches(file_meta=f, fs=catalog.fs, n_rows=read_num_rows) for f in files
    }
    completed: Set[str] = set()
    bytes_read = 0
    values = []
    while set([f.filename for f in files]) != completed:
        # Fill buffer (if required)
        for fn in buffer:
            if len(buffer[fn]) < read_num_rows:
                next_buf = next(datasets[fn], None)
                if next_buf is None:
                    completed.add(fn)
                    continue
                buffer[fn] = buffer[fn].append(next_buf)

        # Determine minimum timestamp
        max_ts_per_frame = [df["ts_init"].max() for df in buffer.values() if not df.empty]
        if not max_ts_per_frame:
            continue
        min_ts = min(max_ts_per_frame)

        # Filter buffer dataframes based on min_timestamp
        batches = []
        for f in files:
            df = buffer[f.filename]
            if df.empty:
                continue
            ts_filter = df["ts_init"] <= min_ts
            batch = df[ts_filter]
            buffer[f.filename] = df[~ts_filter]
            # print(f"{f.filename} batch={len(batch)} buffer={len(buffer)}")
            objs = frame_to_nautilus(df=batch, cls=f.datatype)
            batches.append(objs)
            bytes_read += sum([sys.getsizeof(x) for x in objs])

        # Merge ticks
        values.extend(list(heapq.merge(*batches, key=lambda x: x.ts_init)))
        # print(f"iter complete, {bytes_read=}, flushing at target={target_batch_size_bytes}")
        if bytes_read > target_batch_size_bytes:
            yield values
            bytes_read = 0
            values = []

    if values:
        yield values
Example #19
    def test_backtest_run_streaming_sync(self):
        # Arrange
        node = BacktestNode()
        base = self.backtest_configs[0]
        config = base.replace(strategies=self.strategies,
                              batch_size_bytes=parse_bytes("10kib"))

        # Act
        results = node.run_sync([config])

        # Assert
        assert len(results) == 1
Example #20
def worker_count_info(client):
    """
    Accepts a Client object and returns a dictionary with the
    number of workers per specified GPU size.

    Assumes all GPUs are of the same type.
    """
    gpu_sizes = ["16GB", "32GB", "40GB"]
    counts_by_gpu_size = dict.fromkeys(gpu_sizes, 0)
    tolerance = "2.6GB"

    worker_info = client.scheduler_info()["workers"]
    for worker, info in worker_info.items():
        worker_device_memory = info["gpu"]["memory-total"]
        for gpu_size in gpu_sizes:
            if abs(parse_bytes(gpu_size) -
                   worker_device_memory) < parse_bytes(tolerance):
                counts_by_gpu_size[gpu_size] += 1
                break

    return counts_by_gpu_size
Example #21
def test_job_script(Cluster):
    with Cluster(walltime="00:02:00", processes=4, cores=8,
                 memory="28GB") as cluster:

        job_script = cluster.job_script()
        assert "#PBS" in job_script
        assert "#PBS -N dask-worker" in job_script
        assert "#PBS -l select=1:ncpus=8:mem=27GB" in job_script
        assert "#PBS -l walltime=00:02:00" in job_script
        assert "#PBS -q" not in job_script
        assert "#PBS -A" not in job_script

        assert ("{} -m distributed.cli.dask_worker tcp://".format(
            sys.executable) in job_script)
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script

    with Cluster(
            queue="regular",
            project="DaskOnPBS",
            processes=4,
            cores=8,
            resource_spec="select=1:ncpus=24:mem=100GB",
            memory="28GB",
    ) as cluster:

        job_script = cluster.job_script()
        assert "#PBS -q regular" in job_script
        assert "#PBS -N dask-worker" in job_script
        assert "#PBS -l select=1:ncpus=24:mem=100GB" in job_script
        assert "#PBS -l select=1:ncpus=8:mem=27GB" not in job_script
        assert "#PBS -l walltime=" in job_script
        assert "#PBS -A DaskOnPBS" in job_script

        assert ("{} -m distributed.cli.dask_worker tcp://".format(
            sys.executable) in job_script)
        formatted_bytes = format_bytes(parse_bytes("7GB")).replace(" ", "")
        assert f"--nthreads 2 --nprocs 4 --memory-limit {formatted_bytes}" in job_script
Example #22
    def _memory_per_worker(self) -> int:
        """Return the memory limit per worker for new workers"""
        if not self.new_spec:  # pragma: no cover
            raise ValueError(
                "to scale by memory= your worker definition must include a "
                "memory_limit definition")

        for name in ["memory_limit", "memory"]:
            with suppress(KeyError):
                return parse_bytes(self.new_spec["options"][name])

        raise ValueError(
            "to use scale(memory=...) your worker definition must include a "
            "memory_limit definition")
Example #23
    def __init__(
        self,
        paths,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        **kwargs,
    ):
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn("Using very large partitions sizes for Dask. "
                              "Memory-related errors are likely.")
            part_size = int(device_mem_size(kind="total") * part_mem_fraction)

        # Engine-agnostic path handling
        if hasattr(paths, "name"):
            paths = stringify_path(paths)
        if isinstance(paths, str):
            paths = [paths]

        storage_options = storage_options or {}
        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(
                    paths,
                    part_size,
                    storage_options=storage_options,
                    **kwargs)
            elif engine == "csv":
                self.engine = CSVDatasetEngine(paths,
                                               part_size,
                                               storage_options=storage_options,
                                               **kwargs)
            else:
                raise ValueError("Only parquet and csv supported (for now).")
        else:
            self.engine = engine(paths,
                                 part_size,
                                 storage_options=storage_options)
Example #24
def lsf_unit_detection_helper(expected_unit, conf_text=None):
    temp_dir = tempfile.mkdtemp()
    current_lsf_envdir = os.environ.get("LSF_ENVDIR", None)
    os.environ["LSF_ENVDIR"] = temp_dir
    if conf_text is not None:
        with open(os.path.join(temp_dir, "lsf.conf"), "w") as conf_file:
            conf_file.write(conf_text)
    memory_string = "13GB"
    memory_base = parse_bytes(memory_string)
    correct_memory = lsf.lsf_format_bytes_ceil(memory_base,
                                               lsf_units=expected_unit)
    with LSFCluster(memory=memory_string, cores=1) as cluster:
        assert "#BSUB -M %s" % correct_memory in cluster.job_header
    rmtree(temp_dir)
    if current_lsf_envdir is None:
        del os.environ["LSF_ENVDIR"]
    else:
        os.environ["LSF_ENVDIR"] = current_lsf_envdir
Example #25
def _byte_block_counts(
    urlpath,
    blocksize,
    lineterminator=None,
    compression="infer",
    storage_options=None,
    **kwargs,
):
    """Return a list of paths and block counts.

    Logic copied from dask.bytes.read_bytes
    """

    if lineterminator is not None and len(lineterminator) == 1:
        kwargs["lineterminator"] = lineterminator
    else:
        lineterminator = "\n"

    if compression == "infer":
        paths = get_fs_token_paths(urlpath,
                                   mode="rb",
                                   storage_options=storage_options)[2]
        compression = infer_compression(paths[0])

    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)
    if blocksize and compression:
        blocksize = None

    b_out = read_bytes(
        urlpath,
        delimiter=lineterminator.encode(),
        blocksize=blocksize,
        sample=False,
        compression=compression,
        include_path=True,
        **(storage_options or {}),
    )
    _, values, paths = b_out

    if not isinstance(values[0], (tuple, list)):
        values = [values]

    return paths, [len(v) for v in values]
Example #26
    def __init__(
        self,
        path,
        engine=None,
        part_size=None,
        part_mem_fraction=None,
        storage_options=None,
        **kwargs,
    ):
        if part_size:
            # If a specific partition size is given, use it directly
            part_size = parse_bytes(part_size)
        else:
            # If a fractional partition size is given, calculate part_size
            part_mem_fraction = part_mem_fraction or 0.125
            assert part_mem_fraction > 0.0 and part_mem_fraction < 1.0
            if part_mem_fraction > 0.25:
                warnings.warn("Using very large partitions sizes for Dask. "
                              "Memory-related errors are likely.")
            part_size = int(cuda.current_context().get_memory_info()[1] *
                            part_mem_fraction)

        # Engine-agnostic path handling
        if hasattr(path, "name"):
            path = stringify_path(path)
        storage_options = storage_options or {}
        fs, fs_token, paths = get_fs_token_paths(
            path, mode="rb", storage_options=storage_options)
        paths = sorted(paths, key=natural_sort_key)

        # If engine is not provided, try to infer from end of paths[0]
        if engine is None:
            engine = paths[0].split(".")[-1]
        if isinstance(engine, str):
            if engine == "parquet":
                self.engine = ParquetDatasetEngine(paths, part_size, fs,
                                                   fs_token, **kwargs)
            elif engine == "csv":
                self.engine = CSVDatasetEngine(paths, part_size, fs, fs_token,
                                               **kwargs)
            else:
                raise ValueError("Only parquet and csv supported (for now).")
        else:
            self.engine = engine(paths, part_size, fs, fs_token, **kwargs)
Example #27
def read_csv(path, chunksize="256 MiB", **kwargs):
    if isinstance(chunksize, str):
        chunksize = parse_bytes(chunksize)
    filenames = sorted(glob(str(path)))  # TODO: lots of complexity
    name = "read-csv-" + tokenize(path, tokenize, **
                                  kwargs)  # TODO: get last modified time

    compression = kwargs.get("compression", False)
    if compression:
        # reading compressed CSVs requires reading each file in full
        kwargs.pop("byte_range", None)
        warn("Warning: %s compression does not support breaking apart files.\n"
             "Please ensure that each individual file can fit in memory and\n"
             "use the keyword ``chunksize=None`` to remove this message.\n"
             "Setting ``chunksize=(size of file)``" % compression)
        chunksize = None

    if chunksize is None:
        return read_csv_without_chunksize(path, **kwargs)

    meta = cudf.read_csv(filenames[0], **kwargs)
    dsk = {}
    i = 0
    dtypes = meta.dtypes.values

    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, chunksize):
            kwargs2 = kwargs.copy()
            kwargs2["byte_range"] = (
                start,
                chunksize,
            )  # specify which chunk of the file we care about
            if start != 0:
                kwargs2["names"] = meta.columns  # no header in the middle of the file
                kwargs2["header"] = None
            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)

            i += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example #28
def parse_memory_limit(
    memory_limit: str | float, nthreads: int, total_cores: int = CPU_COUNT
) -> int | None:
    if memory_limit is None:
        return None

    if memory_limit == "auto":
        memory_limit = int(system.MEMORY_LIMIT * min(1, nthreads / total_cores))
    with suppress(ValueError, TypeError):
        memory_limit = float(memory_limit)
        if isinstance(memory_limit, float) and memory_limit <= 1:
            memory_limit = int(memory_limit * system.MEMORY_LIMIT)

    if isinstance(memory_limit, str):
        memory_limit = parse_bytes(memory_limit)
    else:
        memory_limit = int(memory_limit)

    assert isinstance(memory_limit, int)
    if memory_limit == 0:
        return None
    return min(memory_limit, system.MEMORY_LIMIT)
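A rough sketch of the accepted inputs above; only the string form has a machine-independent result, since system.MEMORY_LIMIT is host-specific:

from dask.utils import parse_bytes

# "4GiB" -> parse_bytes, i.e. 4 * 2**30 bytes
# 0.5    -> half of system.MEMORY_LIMIT
# "auto" -> MEMORY_LIMIT scaled by nthreads / total_cores
assert parse_bytes("4GiB") == 4 * 2**30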
Example #29
def parse_device_memory_limit(device_memory_limit, device_index=0):
    """Parse memory limit to be used by a CUDA device.


    Parameters
    ----------
    device_memory_limit: float, int, str or None
        This can be a float (fraction of total device memory), an integer (bytes),
        a string (like 5GB or 5000M), and "auto", 0 or None for the total device
        size.
    device_index: int
        The index of device from which to obtain the total memory amount.

    Examples
    --------
    >>> # On a 32GB CUDA device
    >>> parse_device_memory_limit(None)
    34089730048
    >>> parse_device_memory_limit(0.8)
    27271784038
    >>> parse_device_memory_limit(1000000000)
    1000000000
    >>> parse_device_memory_limit("1GB")
    1000000000
    """
    if any(device_memory_limit == v for v in [0, "0", None, "auto"]):
        return get_device_total_memory(device_index)

    with suppress(ValueError, TypeError):
        device_memory_limit = float(device_memory_limit)
        if isinstance(device_memory_limit, float) and device_memory_limit <= 1:
            return int(
                get_device_total_memory(device_index) * device_memory_limit)

    if isinstance(device_memory_limit, str):
        return parse_bytes(device_memory_limit)
    else:
        return int(device_memory_limit)
Example #30
def convert_chunk_size(chunk_size, factor, dtype, masked):
    """Convert a chunk size given as a string to number of elements.

    Args:
        chunk_size (None, int, str): Chunk size. Conversion using this function is
            only required if a string is given - other values are simply returned.
        factor (int): Number of elements in dimensions which are not chunked, e.g. the
            number of rows if chunking is done exclusively along columns. This is used
            to compute the size per chunk.
        dtype (numpy dtype): Data dtype.
        masked (bool): If True, an additional byte per element will be taken into
            account.

    Returns:
        int or None: Maximum number of elements to stay below the given `chunk_size`.
            If `chunk_size` is None, None will be returned.

    """
    if chunk_size is None or isinstance(chunk_size, int):
        # Do nothing.
        return chunk_size
    element_size = (dtype.itemsize + (1 if masked else 0)) * factor
    return parse_bytes(chunk_size) // element_size
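A worked instance of the conversion above (dtype and factor chosen for illustration): a "1MB" chunk over rows of 10 unmasked float64 values costs 80 bytes per element, so each chunk holds 12_500 elements:

import numpy as np
from dask.utils import parse_bytes

element_size = (np.dtype("float64").itemsize + 0) * 10  # 8 bytes * factor, no mask byte
assert parse_bytes("1MB") // element_size == 12_500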
Example #31
def memory(maxsize=vaex.settings.cache.memory_size_limit, classname="LRUCache", clear=False):
    """Sets a memory cache using cachetools (https://cachetools.readthedocs.io/).

    Calling multiple times with clear=False will keep the current cache (useful in notebook usage).

    :param int or str maxsize: Max size of cache in bytes (or use a string like '128MB')
    :param str classname: classname in the cachetools library used for the cache (e.g. LRUCache, MRUCache).
    :param bool clear: If True, always create a new cache; if False, keep the existing cache when it is of the same type.
    """
    global cache
    from dask.utils import parse_bytes

    maxsize = parse_bytes(maxsize)
    log.debug("set cache to memory (cachetools)")
    old_cache = cache
    if isinstance(classname, str):
        cls = getattr(cachetools, classname)
    else:
        cls = classname
    if clear or type(cache) != cls:
        cache = cls(maxsize=maxsize)
    yield
    log.debug("restore old cache")
    cache = old_cache