Example #1
    def adapt(self, minimum, maximum, cores=1, memory='1 GB', **kwargs):

        # Check if given memory is greater than maximum allowed
        if parse_bytes(memory) > parse_bytes(self.max_memory):
            raise MemoryError(' '.join([
                'could not allocate {:s} of memory,'.format(memory),
                'maximum allowed is {:s}'.format(self.max_memory)
            ]))

        # Check if given memory is lower than minimum allowed
        if parse_bytes(memory) < parse_bytes(self.min_memory):
            raise MemoryError(' '.join([
                'could not allocate {:s} of memory,'.format(memory),
                'minimum allowed is {:s}'.format(self.min_memory)
            ]))

        # Check if number of cores is greater than maximum allowed
        if cores > self.max_cores:
            raise Exception(' '.join([
                'could not allocate {:d} cores,'.format(cores),
                'maximum allowed is {:d}'.format(self.max_cores)
            ]))

        # Check if number of cores is lower than minimum allowed
        if cores < self.min_cores:
            raise Exception(' '.join([
                'could not allocate {:d} cores,'.format(cores),
                'minimum allowed is {:d}'.format(self.min_cores)
            ]))
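The adapt() method above normalizes both sides of every memory comparison with parse_bytes before checking the configured bounds. A minimal, self-contained sketch of the same pattern, runnable outside the original class (the ResourceLimits class and its bound values are illustrative, not part of the original code):

from dask.utils import parse_bytes


class ResourceLimits:
    """Illustrative stand-in for the object that carries the min/max bounds."""
    min_memory, max_memory = "1 GB", "16 GB"
    min_cores, max_cores = 1, 8

    def check(self, memory, cores=1):
        # Same strategy as adapt(): compare byte counts, not strings
        if not parse_bytes(self.min_memory) <= parse_bytes(memory) <= parse_bytes(self.max_memory):
            raise MemoryError("could not allocate %s of memory, allowed range is [%s, %s]"
                              % (memory, self.min_memory, self.max_memory))
        if not self.min_cores <= cores <= self.max_cores:
            raise ValueError("could not allocate %d cores, allowed range is [%d, %d]"
                             % (cores, self.min_cores, self.max_cores))


ResourceLimits().check("2 GB", cores=4)      # within bounds, returns silently
# ResourceLimits().check("32 GB", cores=4)   # would raise MemoryError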
Example #2
def test_parse_bytes():
    assert parse_bytes('100') == 100
    assert parse_bytes('100 MB') == 100000000
    assert parse_bytes('100M') == 100000000
    assert parse_bytes('5kB') == 5000
    assert parse_bytes('5.4 kB') == 5400
    assert parse_bytes('1kiB') == 1024
    assert parse_bytes('1Mi') == 2**20
    assert parse_bytes('1e6') == 1000000
    assert parse_bytes('1e6 kB') == 1000000000
    assert parse_bytes('MB') == 1000000
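As the assertions show, parse_bytes (from dask.utils) treats SI suffixes (kB, MB, ...) as powers of 1000 and binary suffixes (kiB, Mi, ...) as powers of 1024, accepts floats and scientific notation, and reads a bare unit as one of that unit. A quick check in the same spirit, assuming dask is installed:

from dask.utils import parse_bytes

assert parse_bytes('2.5 kiB') == 2560        # binary suffix: 2.5 * 1024
assert parse_bytes('3 MB') == 3_000_000      # SI suffix: 3 * 1000**2
assert parse_bytes('2e3 kB') == 2_000_000    # scientific notation combined with a unit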
Example #3
def test_parse_bytes():
    assert parse_bytes("100") == 100
    assert parse_bytes("100 MB") == 100000000
    assert parse_bytes("100M") == 100000000
    assert parse_bytes("5kB") == 5000
    assert parse_bytes("5.4 kB") == 5400
    assert parse_bytes("1kiB") == 1024
    assert parse_bytes("1Mi") == 2**20
    assert parse_bytes("1e6") == 1000000
    assert parse_bytes("1e6 kB") == 1000000000
    assert parse_bytes("MB") == 1000000
Example #5
    def __init__(
        self,
        scheduler=None,
        name=None,
        disk=None,
        job_extra=None,
        config_name=None,
        **base_class_kwargs
    ):
        super().__init__(
            scheduler=scheduler, name=name, config_name=config_name, **base_class_kwargs
        )

        if disk is None:
            disk = dask.config.get("jobqueue.%s.disk" % self.config_name)
        if disk is None:
            raise ValueError(
                "You must specify how much disk to use per job like ``disk='1 GB'``"
            )
        self.worker_disk = parse_bytes(disk)
        if job_extra is None:
            self.job_extra = dask.config.get(
                "jobqueue.%s.job-extra" % self.config_name, {}
            )
        else:
            self.job_extra = job_extra

        env_extra = base_class_kwargs.get("env_extra", None)
        if env_extra is None:
            env_extra = dask.config.get(
                "jobqueue.%s.env-extra" % self.config_name, default=[]
            )
        self.env_dict = self.env_lines_to_dict(env_extra)

        self.job_header_dict = {
            "MY.DaskWorkerName": '"htcondor--$F(MY.JobId)--"',
            "RequestCpus": "MY.DaskWorkerCores",
            "RequestMemory": "floor(MY.DaskWorkerMemory / 1048576)",
            "RequestDisk": "floor(MY.DaskWorkerDisk / 1024)",
            "MY.JobId": '"$(ClusterId).$(ProcId)"',
            "MY.DaskWorkerCores": self.worker_cores,
            "MY.DaskWorkerMemory": self.worker_memory,
            "MY.DaskWorkerDisk": self.worker_disk,
        }
        if self.log_directory:
            self.job_header_dict.update(
                {
                    "LogDirectory": self.log_directory,
                    # $F(...) strips quotes
                    "Output": "$(LogDirectory)/worker-$F(MY.JobId).out",
                    "Error": "$(LogDirectory)/worker-$F(MY.JobId).err",
                    "Log": "$(LogDirectory)/worker-$(ClusterId).log",
                    # We kill all the workers to stop them so we need to stream their
                    # output+error if we ever want to see anything
                    "Stream_Output": True,
                    "Stream_Error": True,
                }
            )
        if self.job_extra:
            self.job_header_dict.update(self.job_extra)
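This constructor is the HTCondor-specific Job class from dask-jobqueue; users typically reach it through the matching cluster class. A minimal usage sketch, assuming a recent dask-jobqueue and a reachable HTCondor pool (the resource values are placeholders):

from dask_jobqueue import HTCondorCluster

# disk= is mandatory for HTCondor, as enforced by the ValueError above
cluster = HTCondorCluster(cores=1, memory="2 GB", disk="1 GB")
print(cluster.job_script())   # inspect the generated submit description
cluster.scale(jobs=2)         # request two worker jobs from HTCondor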
Example #6
    def __init__(self,
                 disk=None,
                 job_extra=None,
                 schedd=None,
                 config_name="htcondor",
                 **kwargs):
        if disk is None:
            disk = dask.config.get("jobqueue.%s.disk" % config_name)
        if disk is None:
            raise ValueError(
                "You must specify how much disk to use per job like ``disk='1 GB'``"
            )
        self.worker_disk = parse_bytes(disk)
        if job_extra is None:
            self.job_extra = dask.config.get(
                "jobqueue.%s.job-extra" % config_name, {})
        else:
            self.job_extra = job_extra
        if schedd is None:
            schedd = dask.config.get("jobqueue.%s.schedd" % config_name, None)

        # Instantiate args and parameters from parent abstract class
        super().__init__(config_name=config_name, **kwargs)

        env_extra = kwargs.get("env_extra", None)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name,
                                        default=[])
        self.env_dict = self.env_lines_to_dict(env_extra)
        self.env_dict["JOB_ID"] = "$F(MY.JobId)"

        self.job_header_dict = {
            "MY.DaskWorkerName": '"htcondor--$F(MY.JobId)--"',
            "RequestCpus": "MY.DaskWorkerCores",
            "RequestMemory": "floor(MY.DaskWorkerMemory / 1048576)",
            "RequestDisk": "floor(MY.DaskWorkerDisk / 1024)",
            "MY.JobId": '"$(ClusterId).$(ProcId)"',
            "MY.DaskWorkerCores": self.worker_cores,
            "MY.DaskWorkerMemory": self.worker_memory,
            "MY.DaskWorkerDisk": self.worker_disk,
        }
        if self.log_directory:
            self.job_header_dict.update({
                "LogDirectory": self.log_directory,
                # $F(...) strips quotes
                "Output": "$(LogDirectory)/worker-$F(MY.JobId).out",
                "Error": "$(LogDirectory)/worker-$F(MY.JobId).err",
                "Log": "$(LogDirectory)/worker-$(ClusterId).log",
                # We kill all the workers to stop them so we need to stream their
                # output+error if we ever want to see anything
                "Stream_Output": True,
                "Stream_Error": True,
            })
        if self.job_extra:
            self.job_header_dict.update(self.job_extra)
        if schedd:
            self.submit_command += " -name " + shlex.quote(schedd)
            self.cancel_command += " -name " + shlex.quote(schedd)
Example #7
def lsf_unit_detection_helper(expected_unit, conf_text=None):
    temp_dir = tempfile.mkdtemp()
    current_lsf_envdir = os.environ.get("LSF_ENVDIR", None)
    os.environ["LSF_ENVDIR"] = temp_dir
    if conf_text is not None:
        with open(os.path.join(temp_dir, "lsf.conf"), "w") as conf_file:
            conf_file.write(conf_text)
    memory_string = "13GB"
    memory_base = parse_bytes(memory_string)
    correct_memory = lsf.lsf_format_bytes_ceil(memory_base, lsf_units=expected_unit)
    with LSFCluster(memory=memory_string, cores=1) as cluster:
        assert "#BSUB -M %s" % correct_memory in cluster.job_header
    rmtree(temp_dir)
    if current_lsf_envdir is None:
        del os.environ["LSF_ENVDIR"]
    else:
        os.environ["LSF_ENVDIR"] = current_lsf_envdir
Example #8
def _update_lsf_settings():
    from dask_jobqueue import LSFCluster  #@UnresolvedImport @UnusedImport
    # 'ncpus' is how many CPUs are RESERVED for the LSF job.
    # By default, set it to the number of CPUs the workers will actually use ('cores')
    ncpus = dask.config.get("jobqueue.lsf.ncpus", -1)
    if not ncpus or ncpus == -1:
        ncpus = dask.config.get("jobqueue.lsf.cores")
        dask.config.set({"jobqueue.lsf.ncpus": ncpus})

    # Similar to above, the difference between 'mem' and 'memory' is that the former
    # specifies the memory to reserve in LSF, whereas the latter is actually used
    # by Dask workers to determine when they've exceeded their limits.
    mem = dask.config.get("jobqueue.lsf.mem", -1)
    if not mem or mem == -1:
        memory = dask.config.get("jobqueue.lsf.memory", None)
        if memory:
            mem = parse_bytes(memory)
            dask.config.set({"jobqueue.lsf.mem": mem})
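The fallback from 'memory' (the Dask-side limit) to 'mem' (the LSF reservation) can be replayed directly against dask.config; a small sketch with made-up values, not tied to the original project:

import dask
from dask.utils import parse_bytes

# Pretend only the Dask-facing 'memory' value is configured
dask.config.set({"jobqueue.lsf.memory": "4 GB", "jobqueue.lsf.mem": None})

mem = dask.config.get("jobqueue.lsf.mem", -1)
if not mem or mem == -1:
    memory = dask.config.get("jobqueue.lsf.memory", None)
    if memory:
        dask.config.set({"jobqueue.lsf.mem": parse_bytes(memory)})

assert dask.config.get("jobqueue.lsf.mem") == 4_000_000_000  # bytes reserved for LSF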
Example #9
def parse_device_memory_limit(device_memory_limit, device_index=0):
    """Parse memory limit to be used by a CUDA device.


    Parameters
    ----------
    device_memory_limit: float, int, str or None
        This can be a float (fraction of total device memory), an integer (bytes),
        a string (like 5GB or 5000M), and "auto", 0 or None for the total device
        size.
    device_index: int
        The index of device from which to obtain the total memory amount.

    Examples
    --------
    >>> # On a 32GB CUDA device
    >>> parse_device_memory_limit(None)
    34089730048
    >>> parse_device_memory_limit(0.8)
    27271784038
    >>> parse_device_memory_limit(1000000000)
    1000000000
    >>> parse_device_memory_limit("1GB")
    1000000000
    """
    if any(device_memory_limit == v for v in [0, "0", None, "auto"]):
        return get_device_total_memory(device_index)

    with suppress(ValueError, TypeError):
        device_memory_limit = float(device_memory_limit)
        if isinstance(device_memory_limit, float) and device_memory_limit <= 1:
            return int(
                get_device_total_memory(device_index) * device_memory_limit)

    if isinstance(device_memory_limit, str):
        return parse_bytes(device_memory_limit)
    else:
        return int(device_memory_limit)
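The branch structure above can be exercised without a GPU by stubbing the total-memory lookup. A minimal sketch under that assumption (fake_total and parse_limit are illustrative, not part of dask_cuda):

from dask.utils import parse_bytes

fake_total = 34089730048  # pretend this is the total memory of a 32 GB device

def parse_limit(limit, total=fake_total):
    # None/0/"auto" -> whole device; fraction -> share of total;
    # plain number -> bytes; other strings -> parse_bytes
    if any(limit == v for v in (0, "0", None, "auto")):
        return total
    try:
        as_float = float(limit)
    except (ValueError, TypeError):
        return parse_bytes(limit)
    return int(total * as_float) if as_float <= 1 else int(as_float)

assert parse_limit(None) == fake_total
assert parse_limit(0.8) == int(fake_total * 0.8)   # 27271784038, as in the docstring
assert parse_limit("1GB") == 1_000_000_000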
Example #10
 def __init__(self, template, options, tmpl_dir, backend="sge", debug=False):
     # pdb.set_trace()
     self._template = template  # for __repr__
     self.setup = compile_template(
         "module", tmpl_dir, debug, package=" ".join(options["module"])
     )
     self.job_cmd = compile_template(template, tmpl_dir, debug, **options)
     jobopts = {
         **options[backend],
         "memory": "{}".format(parse_bytes(options[backend]["memory"])),
         "name": f"{template}-{uuid4()}",
         "nprocs": options.get("nprocs", 1),
     }
     # TODO: check walltime and cputime format
     # TODO: check if queue is valid
     self.job_header = compile_template(backend, tmpl_dir, debug, **jobopts)
     self.script = compile_template(
         "jobscript",
         tmpl_dir,
         debug,
         job_header=self.job_header,
         setup=self.setup,
         job_cmd=self.job_cmd,
     )
Example #11
 def _get_nb_workers_from_memory(self, memory):
     return math.ceil(
         parse_bytes(memory) /
         parse_bytes(self.jobqueue_worker_spec["memory"]))
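A quick worked example of the ceiling division above, assuming the jobqueue worker spec advertises 2 GB of memory per worker:

import math
from dask.utils import parse_bytes

# 5 GB requested / 2 GB per worker -> 2.5 -> rounded up to 3 workers
assert math.ceil(parse_bytes("5 GB") / parse_bytes("2 GB")) == 3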
Example #12
    def __init__(
        self,
        n_workers=None,
        threads_per_worker=1,
        processes=True,
        memory_limit="auto",
        device_memory_limit=None,
        CUDA_VISIBLE_DEVICES=None,
        data=None,
        local_directory=None,
        protocol=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        ucx_net_devices=None,
        rmm_pool_size=None,
        **kwargs,
    ):
        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES))
        if n_workers is None:
            n_workers = len(CUDA_VISIBLE_DEVICES)
        self.host_memory_limit = parse_memory_limit(memory_limit,
                                                    threads_per_worker,
                                                    n_workers)
        self.device_memory_limit = device_memory_limit

        self.rmm_pool_size = rmm_pool_size
        if rmm_pool_size is not None:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool requested but module 'rmm' is not available. "
                    "For installation instructions, please see "
                    "https://github.com/rapidsai/rmm")  # pragma: no cover
            self.rmm_pool_size = parse_bytes(self.rmm_pool_size)

        if not processes:
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if self.device_memory_limit is None:
            self.device_memory_limit = get_device_total_memory(0)
        elif isinstance(self.device_memory_limit, str):
            self.device_memory_limit = parse_bytes(self.device_memory_limit)

        if data is None:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit":
                    self.device_memory_limit,
                    "memory_limit":
                    self.host_memory_limit,
                    "local_directory":
                    local_directory or dask.config.get("temporary-directory")
                    or os.getcwd(),
                },
            )

        if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
            if protocol is None:
                protocol = "ucx"
            elif protocol != "ucx":
                raise TypeError(
                    "Enabling InfiniBand or NVLink requires protocol='ucx'")

        if ucx_net_devices == "auto":
            try:
                from ucp._libs.topological_distance import TopologicalDistance  # noqa
            except ImportError:
                raise ValueError(
                    "ucx_net_devices set to 'auto' but UCX-Py is not "
                    "installed or it's compiled without hwloc support")
        elif ucx_net_devices == "":
            raise ValueError("ucx_net_devices can not be an empty string")
        self.ucx_net_devices = ucx_net_devices
        self.set_ucx_net_devices = enable_infiniband
        self.host = kwargs.get("host", None)

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=self.host_memory_limit,
            processes=True,
            data=data,
            local_directory=local_directory,
            protocol=protocol,
            config={
                "ucx":
                get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_nvlink=enable_nvlink,
                    enable_infiniband=enable_infiniband,
                    enable_rdmacm=enable_rdmacm,
                )
            },
            **kwargs,
        )

        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize"]
        self.new_spec[
            "options"]["preload_argv"] = self.new_spec["options"].get(
                "preload_argv", []) + ["--create-cuda-context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)
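This constructor appears to be dask_cuda's LocalCUDACluster; the keyword names below come straight from the signature above, while the concrete values are placeholders. A minimal usage sketch, assuming dask_cuda is installed and at least one GPU is visible:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster(
    memory_limit="16 GB",          # host memory per worker
    device_memory_limit="10 GB",   # GPU memory threshold before spilling to host
    rmm_pool_size="2 GB",          # requires the optional rmm package, as checked above
)
client = Client(cluster)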
Example #13
def timeseries(
    fixed_totalsize=False,
    chunk_per_worker=10,
    chunk_size='128 MB',
    num_nodes=1,
    worker_per_node=1,
    chunking_scheme=None,
    io_format=None,
    store_scheme=None,
    # lat=320,
    # lon=384,
    lat=2560,
    lon=3840,
    start='1980-01-01',
    freq='1H',
    nan=False,
    # fs=None,
    # root='.',
):
    """ Create synthetic Xarray dataset filled with random
    data.

    Parameters
    ----------
    chunk_per_worker : int
          number of chunks placed per worker.
          See the chunking best practices at docs.dask.org: a good chunk
          size is around 100 MB, and each worker can hold many chunks,
          which drives the parallelism in Dask.
    chunk_size : str
          chunk size as a byte string (bytes, kB, MB, GB, ...), parsed
          with parse_bytes
    num_nodes : int
           number of compute nodes
    worker_per_node: int
           number of dask workers per node

    chunking_scheme : str
           Whether to chunk across time dimension ('temporal') or
           horizontal dimensions (lat, lon) ('spatial').
           If None, automatically determine chunk sizes along all dimensions.

    lat : int
         number of latitude values

    lon : int
         number of longitude values

    start : datetime (or datetime-like string)
        Start of time series

    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    nan : bool
         Whether to include NaN values in the generated data


    Examples
    --------

    >>> from benchmarks.datasets import timeseries
    >>> ds, chunks = timeseries(chunk_size='128MB', chunk_per_worker=5,
    ...                         chunking_scheme='spatial', lat=500, lon=600)
    >>> ds
    <xarray.Dataset>
    Dimensions:  (lat: 500, lon: 600, time: 267)
    Coordinates:
    * time     (time) datetime64[ns] 1980-01-01 1980-01-02 ... 1980-09-23
    * lon      (lon) float64 -180.0 -179.4 -178.8 -178.2 ... 178.8 179.4 180.0
    * lat      (lat) float64 -90.0 -89.64 -89.28 -88.92 ... 88.92 89.28 89.64 90.0
    Data variables:
        sst      (time, lon, lat) float64 dask.array<shape=(267, 600, 500), .....
    Attributes:
        history:  created for compute benchmarking
    """

    dt = np.dtype('f8')
    itemsize = dt.itemsize
    chunk_size = parse_bytes(chunk_size)
    total_bytes = chunk_size * num_nodes * worker_per_node * chunk_per_worker
    size = total_bytes / itemsize
    print(size)
    timesteps = math.ceil(size / (lat * lon))
    print(timesteps)
    shape = (timesteps, lon, lat)
    if chunking_scheme == 'temporal':
        x = math.ceil(chunk_size / (lon * lat * itemsize))
        chunks = (x, lon, lat)
    elif chunking_scheme == 'spatial':
        x = math.ceil(math.sqrt(chunk_size / (timesteps * itemsize)))
        chunks = (timesteps, x, x)
    else:
        chunks = 'auto'

    lats = xr.DataArray(np.linspace(start=-90, stop=90, num=lat), dims=['lat'])
    lons = xr.DataArray(np.linspace(start=-180, stop=180, num=lon),
                        dims=['lon'])
    times = xr.DataArray(pd.date_range(start=start,
                                       freq=freq,
                                       periods=timesteps),
                         dims=['time'])
    if chunks == 'auto':
        with dask.config.set({'array.chunk-size': chunk_size}):
            random_data = randn(shape=shape, chunks=chunks, nan=nan)
    else:
        random_data = randn(shape=shape, chunks=chunks, nan=nan)
    ds = xr.DataArray(
        random_data,
        dims=['time', 'lon', 'lat'],
        coords={
            'time': times,
            'lon': lons,
            'lat': lats
        },
        name='sst',
        # encoding=None,
        attrs={
            'units': 'baz units',
            'description': 'a description',
            'history': 'created for compute benchmarking',
        },
    ).to_dataset()
    return ds, chunks
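To make the sizing arithmetic concrete, here is a worked pass with the defaults above (1 node, 1 worker per node, 10 chunks of 128 MB, lat=2560, lon=3840); the lines below simply replay the expressions from the function body:

import math
from dask.utils import parse_bytes

chunk_size = parse_bytes('128 MB')            # 128_000_000 bytes
total_bytes = chunk_size * 1 * 1 * 10         # num_nodes * worker_per_node * chunk_per_worker
size = total_bytes / 8                        # float64 itemsize -> 160_000_000 values
timesteps = math.ceil(size / (2560 * 3840))   # lat * lon points per time step
assert timesteps == 17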
Example #14
    def __init__(
        self,
        scheduler=None,
        name=None,
        cores=None,
        memory=None,
        processes=None,
        nanny=True,
        interface=None,
        death_timeout=None,
        local_directory=None,
        extra=None,
        env_extra=None,
        header_skip=None,
        log_directory=None,
        shebang=None,
        python=sys.executable,
        job_name=None,
        config_name=None,
        **kwargs
    ):
        self.scheduler = scheduler
        self.job_id = None

        super().__init__()

        if config_name is None:
            config_name = getattr(type(self), "config_name")
        if config_name is None:
            raise ValueError(
                "Looks like you are trying to create a class that inherits from dask_jobqueue.core.Job. "
                "If that is the case, you need to:\n"
                "- set the 'config_name' class variable to a non-None value\n"
                "- create a section in jobqueue.yaml with the value of 'config_name'\n"
                "If that is not the case, please open an issue in https://github.com/dask/dask-jobqueue/issues."
            )

        if job_name is None:
            job_name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
        if local_directory is None:
            local_directory = dask.config.get(
                "jobqueue.%s.local-directory" % config_name
            )
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if header_skip is None:
            header_skip = dask.config.get("jobqueue.%s.header-skip" % config_name, ())
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if cores is None or memory is None:
            raise ValueError(
                "You must specify how much cores and memory per job you want to use, for example:\n"
                "cluster = {}(cores={}, memory={!r})".format(
                    self.__class__.__name__, cores or 8, memory or "24GB"
                )
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra = extra + ["--interface", interface]
            kwargs.setdefault("host", get_ip_interface(interface))
        else:
            kwargs.setdefault("host", "")

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name
        self.job_name = job_name

        self.shebang = shebang

        self._env_header = "\n".join(filter(None, env_extra))
        self.header_skip = set(header_skip)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python
        )
        command_args = [dask_worker_command, self.scheduler]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", str(name)]
        command_args += ["--nanny" if nanny else "--no-nanny"]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
Example #15
 def _get_nb_workers_from_memory(self, memory):
     return math.ceil(
         parse_bytes(memory) / parse_bytes(self.worker_spec['memory']))
Example #16
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #17
    def __init__(self,
                 name=dask.config.get('jobqueue.name'),
                 threads=dask.config.get('jobqueue.threads'),
                 processes=dask.config.get('jobqueue.processes'),
                 memory=dask.config.get('jobqueue.memory'),
                 interface=dask.config.get('jobqueue.interface'),
                 death_timeout=dask.config.get('jobqueue.death-timeout'),
                 local_directory=dask.config.get('jobqueue.local-directory'),
                 extra=dask.config.get('jobqueue.extra'),
                 env_extra=dask.config.get('jobqueue.env-extra'),
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if not self.cancel_command or not self.submit_command:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_threads = threads
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = ('%(python)s -m distributed.cli.dask_worker' %
                               dict(python=sys.executable))
        self._command_template = ' '.join(
            [dask_worker_command, self.scheduler.address])
        if threads is not None:
            self._command_template += " --nthreads %d" % threads
        if processes is not None:
            self._command_template += " --nprocs %d" % processes
        if memory is not None:
            self._command_template += " --memory-limit %s" % memory
        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra
Example #18
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 walltime=None,
                 threads=None,
                 **kwargs
                 ):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' % self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' % self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' % self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' % self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' % self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' % self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError("You must specify how many cores to use per job "
                             "like ``cores=8``")

        if memory is None:
            raise ValueError("You must specify how much memory to use per job "
                             "like ``memory='24 GB'``")

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.local_cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory)

        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = (
            '%(python)s -m distributed.cli.dask_worker' % dict(python=sys.executable))
        self._command_template = ' '.join([dask_worker_command, self.scheduler.address])
        self._command_template += " --nthreads %d" % self.worker_threads
        if processes is not None and processes > 1:
            self._command_template += " --nprocs %d" % processes

        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(' ', '')
        self._command_template += " --memory-limit %s" % mem

        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d" # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra
Example #19
    def __init__(self,
                 name='dask-worker',
                 threads=2,
                 processes=4,
                 memory='8GB',
                 interface=None,
                 death_timeout=60,
                 local_directory=None,
                 extra='',
                 env_extra=[],
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if not self.cancel_command or not self.submit_command:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(memory)
        self.worker_processes = processes
        self.worker_threads = threads
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        self._command_template = os.path.join(
            dirname, 'dask-worker %s' % self.scheduler.address)
        if threads is not None:
            self._command_template += " --nthreads %d" % threads
        if processes is not None:
            self._command_template += " --nprocs %d" % processes
        if memory is not None:
            self._command_template += " --memory-limit %s" % memory
        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra
Example #20
    def __init__(
        self,
        n_workers=None,
        threads_per_worker=1,
        processes=True,
        memory_limit=None,
        device_memory_limit=None,
        CUDA_VISIBLE_DEVICES=None,
        data=None,
        local_dir=None,
        **kwargs,
    ):
        if n_workers is None:
            n_workers = get_n_gpus()
        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES))
        if memory_limit is None:
            memory_limit = TOTAL_MEMORY / n_workers
        self.host_memory_limit = memory_limit
        self.device_memory_limit = device_memory_limit

        if not processes:
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if self.device_memory_limit is None:
            self.device_memory_limit = get_device_total_memory(0)
        elif isinstance(self.device_memory_limit, str):
            self.device_memory_limit = parse_bytes(self.device_memory_limit)

        if data is None:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit":
                    self.device_memory_limit,
                    "memory_limit":
                    self.host_memory_limit,
                    "local_dir":
                    local_dir or dask.config.get("temporary-directory")
                    or os.getcwd(),
                },
            )

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=memory_limit,
            processes=True,
            data=data,
            local_dir=local_dir,
            **kwargs,
        )

        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize_context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)
Example #21
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    local_dir = kwargs.get("local_dir", "dask-worker-space")
    with warn_on_duration(
            "1s",
            "Creating scratch directories is taking a surprisingly long time. "
            "This is often due to running workers on a network file system. "
            "Consider specifying a local-directory to point workers to write "
            "scratch data to a local disk.",
    ):
        _workspace = WorkSpace(os.path.abspath(local_dir))
        _workdir = _workspace.new_work_dir(prefix="worker-")
        local_dir = _workdir.dir_path

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit":
                    get_device_total_memory(index=i) if
                    (device_memory_limit == "auto" or device_memory_limit
                     == int(0)) else parse_bytes(device_memory_limit),
                    "memory_limit":
                    parse_memory_limit(memory_limit,
                                       nthreads,
                                       total_cores=nprocs),
                    "local_dir":
                    local_dir,
                },
            ),
            **kwargs,
        ) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #22
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #23
    def __init__(
        self,
        scheduler=None,
        name=None,
        cores=None,
        memory=None,
        processes=None,
        nanny=True,
        protocol=None,
        security=None,
        interface=None,
        death_timeout=None,
        local_directory=None,
        extra=None,
        env_extra=None,
        header_skip=None,
        log_directory=None,
        shebang=None,
        python=sys.executable,
        job_name=None,
        config_name=None,
    ):
        self.scheduler = scheduler
        self.job_id = None

        super().__init__()

        default_config_name = self.default_config_name()
        if config_name is None:
            config_name = default_config_name
        self.config_name = config_name

        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % self.config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % self.config_name)

        if cores is None or memory is None:
            job_class_name = self.__class__.__name__
            cluster_class_name = job_class_name.replace("Job", "Cluster")
            raise ValueError(
                "You must specify how much cores and memory per job you want to use, for example:\n"
                "cluster = {}(cores={}, memory={!r})".format(
                    cluster_class_name, cores or 8, memory or "24GB"
                )
            )

        if job_name is None:
            job_name = dask.config.get("jobqueue.%s.name" % self.config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % self.config_name)
            if processes is None:
                processes, _ = nprocesses_nthreads(cores)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % self.config_name)
        if death_timeout is None:
            death_timeout = dask.config.get(
                "jobqueue.%s.death-timeout" % self.config_name
            )
        if local_directory is None:
            local_directory = dask.config.get(
                "jobqueue.%s.local-directory" % self.config_name
            )
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % self.config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % self.config_name)
        if header_skip is None:
            header_skip = dask.config.get(
                "jobqueue.%s.header-skip" % self.config_name, ()
            )
        if log_directory is None:
            log_directory = dask.config.get(
                "jobqueue.%s.log-directory" % self.config_name
            )
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % self.config_name)

        # This attribute should be set in the derived class
        self.job_header = None

        if interface:
            extra = extra + ["--interface", interface]
        if protocol:
            extra = extra + ["--protocol", protocol]
        if security:
            worker_security_dict = security.get_tls_config_for_role("worker")
            security_command_line_list = [
                ["--tls-" + key.replace("_", "-"), value]
                for key, value in worker_security_dict.items()
                # 'ciphers' parameter does not have a command-line equivalent
                if key != "ciphers"
            ]
            security_command_line = sum(security_command_line_list, [])
            extra = extra + security_command_line

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name
        self.job_name = job_name

        self.shebang = shebang

        self._env_header = "\n".join(filter(None, env_extra))
        self.header_skip = set(header_skip)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python
        )
        command_args = [dask_worker_command, self.scheduler]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", str(name)]
        command_args += ["--nanny" if nanny else "--no-nanny"]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
Example #24
    def __init__(
        self,
        scheduler=None,
        host=None,
        nthreads=0,
        name=None,
        memory_limit="auto",
        device_memory_limit="auto",
        rmm_pool_size=None,
        rmm_managed_memory=False,
        pid_file=None,
        resources=None,
        dashboard=True,
        dashboard_address=":0",
        local_directory=None,
        scheduler_file=None,
        interface=None,
        death_timeout=None,
        preload=[],
        dashboard_prefix=None,
        security=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        net_devices=None,
        **kwargs,
    ):
        # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
        # initialization happens before we can set CUDA_VISIBLE_DEVICES
        os.environ["RAPIDS_NO_INITIALIZE"] = "True"

        enable_proctitle_on_current()
        enable_proctitle_on_children()

        try:
            nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
        except KeyError:
            nprocs = get_n_gpus()

        if not nthreads:
            nthreads = min(1, multiprocessing.cpu_count() // nprocs)

        memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

        if pid_file:
            with open(pid_file, "w") as f:
                f.write(str(os.getpid()))

            def del_pid_file():
                if os.path.exists(pid_file):
                    os.remove(pid_file)

            atexit.register(del_pid_file)

        services = {}

        if dashboard:
            try:
                from distributed.dashboard import BokehWorker
            except ImportError:
                pass
            else:
                if dashboard_prefix:
                    result = (BokehWorker, {"prefix": dashboard_prefix})
                else:
                    result = BokehWorker
                services[("dashboard", dashboard_address)] = result

        if resources:
            resources = resources.replace(",", " ").split()
            resources = dict(pair.split("=") for pair in resources)
            resources = valmap(float, resources)
        else:
            resources = None

        loop = IOLoop.current()

        preload_argv = kwargs.get("preload_argv", [])
        kwargs = {"worker_port": None, "listen_address": None}
        t = Nanny

        if (
            not scheduler
            and not scheduler_file
            and dask.config.get("scheduler-address", None) is None
        ):
            raise ValueError(
                "Need to provide scheduler address like\n"
                "dask-worker SCHEDULER_ADDRESS:8786"
            )

        if interface and host:
            raise ValueError("Can not specify both interface and host")

        if rmm_pool_size is not None or rmm_managed_memory:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool requested but module 'rmm' is not available. "
                    "For installation instructions, please see "
                    "https://github.com/rapidsai/rmm"
                )  # pragma: no cover
            if rmm_pool_size is not None:
                rmm_pool_size = parse_bytes(rmm_pool_size)
        else:
            if enable_nvlink:
                warnings.warn(
                    "When using NVLink we recommend setting a "
                    "`rmm_pool_size`.  Please see: "
                    "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                    "#important-notes for more details"
                )

        if enable_nvlink and rmm_managed_memory:
            raise ValueError(
                "RMM managed memory and NVLink are currently incompatible."
            )

        # Ensure this parent dask-cuda-worker process uses the same UCX
        # configuration as child worker processes created by it.
        initialize(
            create_cuda_context=False,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
            enable_rdmacm=enable_rdmacm,
            net_devices=net_devices,
            cuda_device_index=0,
        )

        self.nannies = [
            t(
                scheduler,
                scheduler_file=scheduler_file,
                nthreads=nthreads,
                services=services,
                loop=loop,
                resources=resources,
                memory_limit=memory_limit,
                interface=_get_interface(interface, host, i, net_devices),
                host=host,
                preload=(list(preload) or []) + ["dask_cuda.initialize"],
                preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
                security=security,
                env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
                plugins={
                    CPUAffinity(get_cpu_affinity(i)),
                    RMMSetup(rmm_pool_size, rmm_managed_memory),
                },
                name=name if nprocs == 1 or not name else name + "-" + str(i),
                local_directory=local_directory,
                config={
                    "ucx": get_ucx_config(
                        enable_tcp_over_ucx=enable_tcp_over_ucx,
                        enable_infiniband=enable_infiniband,
                        enable_nvlink=enable_nvlink,
                        enable_rdmacm=enable_rdmacm,
                        net_devices=net_devices,
                        cuda_device_index=i,
                    )
                },
                data=(
                    DeviceHostFile,
                    {
                        "device_memory_limit": parse_device_memory_limit(
                            device_memory_limit, device_index=i
                        ),
                        "memory_limit": memory_limit,
                        "local_directory": local_directory,
                    },
                ),
                **kwargs,
            )
            for i in range(nprocs)
        ]
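
Each nanny above receives its own CUDA_VISIBLE_DEVICES via cuda_visible_devices(i), so that worker i treats GPU i as its primary device. Below is a minimal sketch of one way such a rotation can be built; the real helper lives in dask_cuda.utils, and rotate_visible_devices here is only a hypothetical stand-in written for illustration.

def rotate_visible_devices(i, devices):
    """Return a CUDA_VISIBLE_DEVICES string with device ``i`` listed first."""
    # Rotate the device list so index i leads, keeping the relative order.
    rotated = devices[i:] + devices[:i]
    return ",".join(map(str, rotated))


# Worker 1 on a 4-GPU machine would see GPU 1 as its primary device:
print(rotate_visible_devices(1, [0, 1, 2, 3]))  # -> "1,2,3,0"
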
Example #25
0
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 walltime=None,
                 threads=None,
                 python=sys.executable,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError(
                'JobQueueCluster is an abstract class that should not be instantiated.'
            )

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' %
                                     self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' %
                                        self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' %
                                        self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' %
                                            self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' %
                                              self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' %
                                        self.scheduler_name)
        if log_directory is None:
            log_directory = dask.config.get('jobqueue.%s.log-directory' %
                                            self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ['--interface', interface]
            kwargs.setdefault('ip', get_ip_interface(interface))
        else:
            kwargs.setdefault('ip', '')

        # Bokeh diagnostics server should listen on all interfaces
        diagnostics_ip_and_port = ('', 8787)
        self.local_cluster = LocalCluster(
            n_workers=0, diagnostics_port=diagnostics_ip_and_port, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = '%(python)s -m distributed.cli.dask_worker' % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ['--nthreads', self.worker_threads]
        if processes is not None and processes > 1:
            command_args += ['--nprocs', processes]

        mem = format_bytes(self.worker_memory / self.worker_processes)
        command_args += ['--memory-limit', mem.replace(' ', '')]
        command_args += ['--name', '%s--${JOB_ID}--' % name]

        if death_timeout is not None:
            command_args += ['--death-timeout', death_timeout]
        if local_directory is not None:
            command_args += ['--local-directory', local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = ' '.join(map(str, command_args))

        self._target_scale = 0

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
Example #26
0
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 threads=None,
                 shebang=None,
                 python=sys.executable,
                 config_name=None,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        super(JobQueueCluster, self).__init__()

        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if config_name is None:
            raise NotImplementedError(
                "JobQueueCluster is an abstract class that should not be instantiated."
            )

        if name is None:
            name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" %
                                            config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" %
                                              config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" %
                                            config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if dask.config.get("jobqueue.%s.threads", None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ["--interface", interface]
            kwargs.setdefault("ip", get_ip_interface(interface))
        else:
            kwargs.setdefault("ip", "")

        # Bokeh diagnostics server should listen on all interfaces
        kwargs.setdefault("dashboard_address", ("", 8787))
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self.shebang = shebang

        self._env_header = "\n".join(env_extra)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", "%s--${JOB_ID}--" % name]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
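
To make the command-line assembly in the two JobQueueCluster initializers above concrete, the sketch below builds the same kind of dask-worker invocation by hand for hypothetical values (cores=8, processes=2, memory='24 GB', name='dask-worker'). It assumes the per-process properties split cores and memory evenly across processes, mirroring the explicit division done in Example #25; the scheduler address is a placeholder.

from dask.utils import format_bytes, parse_bytes

cores, processes, memory, name = 8, 2, "24 GB", "dask-worker"  # hypothetical job settings

threads_per_process = cores // processes                            # 4 threads per process
memory_per_process = format_bytes(parse_bytes(memory) / processes)  # human-readable string

command_args = [
    "python -m distributed.cli.dask_worker",
    "tcp://scheduler:8786",                      # placeholder scheduler address
    "--nthreads", threads_per_process,
    "--nprocs", processes,
    "--memory-limit", memory_per_process.replace(" ", ""),
    "--name", "%s--${JOB_ID}--" % name,
]
print(" ".join(map(str, command_args)))
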
Example #27
0
    def __init__(
        self,
        n_workers=None,
        threads_per_worker=1,
        processes=True,
        memory_limit="auto",
        device_memory_limit=0.8,
        CUDA_VISIBLE_DEVICES=None,
        data=None,
        local_directory=None,
        protocol=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        ucx_net_devices=None,
        rmm_pool_size=None,
        rmm_managed_memory=False,
        **kwargs,
    ):
        # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
        # initialization happens before we can set CUDA_VISIBLE_DEVICES
        os.environ["RAPIDS_NO_INITIALIZE"] = "True"

        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(
            map(parse_cuda_visible_device, CUDA_VISIBLE_DEVICES))
        if n_workers is None:
            n_workers = len(CUDA_VISIBLE_DEVICES)
        self.host_memory_limit = parse_memory_limit(memory_limit,
                                                    threads_per_worker,
                                                    n_workers)
        self.device_memory_limit = parse_device_memory_limit(
            device_memory_limit, device_index=0)

        self.rmm_pool_size = rmm_pool_size
        self.rmm_managed_memory = rmm_managed_memory
        if rmm_pool_size is not None or rmm_managed_memory:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool or managed memory requested but module 'rmm' "
                    "is not available. For installation instructions, please "
                    "see https://github.com/rapidsai/rmm")  # pragma: no cover
            if self.rmm_pool_size is not None:
                self.rmm_pool_size = parse_bytes(self.rmm_pool_size)
        else:
            if enable_nvlink:
                warnings.warn(
                    "When using NVLink we recommend setting a "
                    "`rmm_pool_size`. Please see: "
                    "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                    "#important-notes for more details")

        if not processes:
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if data is None:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit": self.device_memory_limit,
                    "memory_limit": self.host_memory_limit,
                    "local_directory": local_directory
                    or dask.config.get("temporary-directory")
                    or os.getcwd(),
                },
            )

        if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
            if protocol is None:
                protocol = "ucx"
            elif protocol != "ucx":
                raise TypeError(
                    "Enabling InfiniBand or NVLink requires protocol='ucx'")

        if ucx_net_devices == "auto":
            try:
                from ucp._libs.topological_distance import TopologicalDistance  # NOQA
            except ImportError:
                raise ValueError(
                    "ucx_net_devices set to 'auto' but UCX-Py is not "
                    "installed or it's compiled without hwloc support")
        elif ucx_net_devices == "":
            raise ValueError("ucx_net_devices can not be an empty string")
        self.ucx_net_devices = ucx_net_devices
        self.set_ucx_net_devices = enable_infiniband
        self.host = kwargs.get("host", None)

        initialize(
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_nvlink=enable_nvlink,
            enable_infiniband=enable_infiniband,
            enable_rdmacm=enable_rdmacm,
            net_devices=ucx_net_devices,
            cuda_device_index=0,
        )

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=self.host_memory_limit,
            processes=True,
            data=data,
            local_directory=local_directory,
            protocol=protocol,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_nvlink=enable_nvlink,
                    enable_infiniband=enable_infiniband,
                    enable_rdmacm=enable_rdmacm,
                )
            },
            **kwargs,
        )

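        # Ensure every worker spawned from this spec preloads dask_cuda.initialize
        # with --create-cuda-context, so a CUDA context exists at worker startup.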
        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize"]
        self.new_spec[
            "options"]["preload_argv"] = self.new_spec["options"].get(
                "preload_argv", []) + ["--create-cuda-context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)
Example #28
0
def set_rmm():
    # Local imports so the function can be shipped to and run inside worker processes.
    import cupy
    import rmm
    from dask.utils import parse_bytes

    rmm.reinitialize(pool_allocator=True,
                     managed_memory=False,
                     initial_pool_size=parse_bytes("6GB"))
    cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
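
A typical way to apply set_rmm is to run it once in every worker process of an already-running dask-cuda cluster. The sketch below assumes dask-cuda is installed and GPUs are available; Client.run and LocalCUDACluster are real APIs, but the bare cluster settings are placeholders.

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

cluster = LocalCUDACluster()   # one worker per visible GPU
client = Client(cluster)

# Execute set_rmm in each worker process so CuPy allocations there
# are served from the 6 GB RMM pool configured above.
client.run(set_rmm)
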
Example #29
0
def timeseries(
    chunk_size='128 MB',
    n_workers=1,
    chunk_over_time_dim=True,
    lat=320,
    lon=384,
    start='1980-01-01',
    freq='1D',
    nan=False,
):
    """ Create synthetic Xarray dataset filled with random
    data.

    Parameters
    ----------
    chunk_size : str
          chunk size in bytes, kilo, mega or any factor of bytes
    n_workers : int
           number of dask workers
    chunk_over_time_dim : bool, default True
           Whether to chunk across time dimension or horizontal dimensions (lat, lon)
    lat : int
         number of latitude values

    lon : int
         number of longitude values

    start : datetime (or datetime-like string)
        Start of time series

    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    nan : bool
         Whether to include nan in generated data


    Examples
    ---------

    >>> from benchmarks.datasets import timeseries
    >>> ds = timeseries('128MB', 5, chunk_over_time_dim=False, lat=500, lon=600)
    >>> ds
    <xarray.Dataset>
    Dimensions:  (lat: 500, lon: 600, time: 267)
    Coordinates:
    * time     (time) datetime64[ns] 1980-01-01 1980-01-02 ... 1980-09-23
    * lon      (lon) float64 -180.0 -179.4 -178.8 -178.2 ... 178.8 179.4 180.0
    * lat      (lat) float64 -90.0 -89.64 -89.28 -88.92 ... 88.92 89.28 89.64 90.0
    Data variables:
        sst      (time, lon, lat) float64 dask.array<shape=(267, 600, 500), chunksize=(267, 245, 245)>
    Attributes:
        history:  created for compute benchmarking
    """

    dt = np.dtype('f8')
    itemsize = dt.itemsize
    chunk_size = parse_bytes(chunk_size)
    total_bytes = chunk_size * n_workers
    size = total_bytes / itemsize
    timesteps = math.ceil(size / (lat * lon))
    shape = (timesteps, lon, lat)
    if chunk_over_time_dim:
        x = math.ceil(chunk_size / (lon * lat * itemsize))
        chunks = (x, lon, lat)
    else:
        x = math.ceil(math.sqrt(chunk_size / (timesteps * itemsize)))
        chunks = (timesteps, x, x)

    lats = xr.DataArray(np.linspace(start=-90, stop=90, num=lat), dims=['lat'])
    lons = xr.DataArray(np.linspace(start=-180, stop=180, num=lon), dims=['lon'])
    times = xr.DataArray(pd.date_range(start=start, freq=freq, periods=timesteps), dims=['time'])
    random_data = randn(shape=shape, chunks=chunks, nan=nan)
    ds = xr.DataArray(
        random_data,
        dims=['time', 'lon', 'lat'],
        coords={'time': times, 'lon': lons, 'lat': lats},
        name='sst',
        encoding=None,
        attrs={'units': 'baz units', 'description': 'a description'},
    ).to_dataset()
    ds.attrs = {'history': 'created for compute benchmarking'}

    return ds
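
The sizing arithmetic in timeseries can be checked against the docstring example ('128MB' chunks, 5 workers, chunk_over_time_dim=False, lat=500, lon=600): the short calculation below reproduces the 267 timesteps and the (267, 245, 245) chunk shape shown there.

import math

from dask.utils import parse_bytes

chunk_size = parse_bytes('128MB')            # 128_000_000 bytes
n_workers, lat, lon, itemsize = 5, 500, 600, 8

total_bytes = chunk_size * n_workers         # 640_000_000 bytes
size = total_bytes / itemsize                # 80_000_000 float64 elements
timesteps = math.ceil(size / (lat * lon))    # ceil(80e6 / 300_000) = 267

# chunk_over_time_dim=False, so chunks are square over (lat, lon):
x = math.ceil(math.sqrt(chunk_size / (timesteps * itemsize)))  # 245
print((timesteps, lon, lat), (timesteps, x, x))  # (267, 600, 500) (267, 245, 245)
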
Example #30
0
    def __init__(
        self,
        n_workers=None,
        threads_per_worker=1,
        processes=True,
        memory_limit=None,
        device_memory_limit=None,
        CUDA_VISIBLE_DEVICES=None,
        data=None,
        local_directory=None,
        protocol=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        ucx_net_devices=None,
        **kwargs,
    ):
        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES))
        if n_workers is None:
            n_workers = len(CUDA_VISIBLE_DEVICES)
        if memory_limit is None:
            memory_limit = MEMORY_LIMIT / n_workers
        self.host_memory_limit = memory_limit
        self.device_memory_limit = device_memory_limit

        if not processes:
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if self.device_memory_limit is None:
            self.device_memory_limit = get_device_total_memory(0)
        elif isinstance(self.device_memory_limit, str):
            self.device_memory_limit = parse_bytes(self.device_memory_limit)

        if data is None:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit": self.device_memory_limit,
                    "memory_limit": self.host_memory_limit,
                    "local_directory": local_directory
                    or dask.config.get("temporary-directory")
                    or os.getcwd(),
                },
            )

        if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
            if protocol is None:
                protocol = "ucx"
            elif protocol != "ucx":
                raise TypeError(
                    "Enabling InfiniBand or NVLink requires protocol='ucx'")

            initialize(
                enable_tcp_over_ucx=enable_tcp_over_ucx,
                enable_infiniband=enable_infiniband,
                enable_nvlink=enable_nvlink,
            )

        if ucx_net_devices == "auto":
            try:
                from ucp._libs.topological_distance import TopologicalDistance  # noqa
            except ImportError:
                raise ValueError(
                    "ucx_net_devices set to 'auto' but UCX-Py is not "
                    "installed or it's compiled without hwloc support")
        elif ucx_net_devices == "":
            raise ValueError("ucx_net_devices can not be an empty string")
        self.ucx_net_devices = ucx_net_devices
        self.set_ucx_net_devices = enable_infiniband

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=memory_limit,
            processes=True,
            data=data,
            local_directory=local_directory,
            protocol=protocol,
            **kwargs,
        )

        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize"]
        self.new_spec[
            "options"]["preload_argv"] = self.new_spec["options"].get(
                "preload_argv", []) + ["--create-cuda-context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)