def test_pause_executor(c, s, a):
    memory = psutil.Process().memory_info().rss
    a.memory_limit = memory / 0.5 + 200e6
    np = pytest.importorskip("numpy")

    def f():
        x = np.ones(int(400e6), dtype="u1")
        sleep(1)

    with captured_logger(logging.getLogger("distributed.worker")) as logger:
        future = c.submit(f)
        futures = c.map(slowinc, range(30), delay=0.1)

        start = time()
        while not a.paused:
            yield gen.sleep(0.01)
            assert time() < start + 4, (
                format_bytes(psutil.Process().memory_info().rss),
                format_bytes(a.memory_limit),
                len(a.data),
            )
        out = logger.getvalue()
        assert "memory" in out.lower()
        assert "pausing" in out.lower()

    assert sum(f.status == "finished" for f in futures) < 4

    yield wait(futures)
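This coroutine-style test normally runs under distributed's test harness; a hedged sketch of the decorator it assumes (argument values here are illustrative, not taken from the original suite):

# Sketch only: gen_cluster(client=True, ...) supplies (c, s, a) -- client,
# scheduler, and one worker -- for generator-style tests like the one above.
from distributed.utils_test import gen_cluster

@gen_cluster(client=True, nthreads=[("127.0.0.1", 1)],
             worker_kwargs={"memory_monitor_interval": 10})
def test_pause_executor(c, s, a):
    ...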
Example #2
from functools import reduce
from operator import mul

from distributed.utils import format_bytes


def print_ds_info(ds, var):
    """Print chunking and size information for one variable of a Dataset."""
    dt = ds[var].dtype
    itemsize = dt.itemsize
    chunk_size = ds[var].data.chunksize
    size = format_bytes(ds.nbytes)
    _bytes = reduce(mul, chunk_size) * itemsize
    chunk_size_bytes = format_bytes(_bytes)

    print(f'Variable name: {var}')
    print(f'Dataset dimensions: {ds[var].dims}')
    print(f'Chunk shape: {chunk_size}')
    print(f'Dataset shape: {ds[var].shape}')
    print(f'Chunk size: {chunk_size_bytes}')
    print(f'Dataset size: {size}')
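A minimal usage sketch for print_ds_info, assuming an xarray Dataset opened with dask chunking; the file path, chunk sizes, and variable name are hypothetical:

# Hypothetical usage: open a chunked dataset and inspect one variable.
import xarray as xr

ds = xr.open_dataset("sst.nc", chunks={"time": 100})  # illustrative path/chunks
print_ds_info(ds, "sst")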
Example #3
def main(args=None):
    args = parse_args(args)

    if args.protocol == 'ucx':
        sched_str = "ucx://" + args.server + ":" + args.port
        client = Client(sched_str)
    else:
        kwargs = {'n_workers': 2, 'threads_per_worker': 40}
        kwargs['processes'] = args.protocol == 'tcp'
        cluster = LocalCluster(**kwargs)
        client = Client(cluster)

    print(f"Connected to {client}")
    N = 1_000_000
    P = 1_000
    X = da.random.uniform(size=(N, P), chunks=(N // 100, P))
    print(format_bytes(X.nbytes))

    result = X.T.dot(X)
    start = clock()
    result.compute()
    stop = clock()
    print(result)
    print(f"\tTook {stop - start:0.2f}s")
    time.sleep(10)
Example #4
    def _widget_status(self):
        workers = len(self.scheduler.workers)
        cores = sum(ws.ncores for ws in self.scheduler.workers.values())
        memory = sum(ws.memory_limit for ws in self.scheduler.workers.values())
        memory = format_bytes(memory)
        text = """
<div>
  <style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
  </style>
  <table style="text-align: right;">
    <tr><th>Workers</th> <td>%d</td></tr>
    <tr><th>Cores</th> <td>%d</td></tr>
    <tr><th>Memory</th> <td>%s</td></tr>
  </table>
</div>
""" % (
            workers,
            cores,
            memory,
        )
        return text
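For reference, format_bytes -- the utility every snippet on this page relies on -- turns raw byte counts into short human-readable strings; a quick check:

# format_bytes is importable from both dask.utils and distributed.utils.
from distributed.utils import format_bytes

print(format_bytes(1234))         # e.g. '1.21 kiB'
print(format_bytes(17179869184))  # e.g. '16.00 GiB'
# Older releases print decimal-style units such as '16.00 GB' instead.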
Example #5
    def _widget_status(self):
        client = self._dask_client()

        workers = client.scheduler_info()['workers']

        n_workers = len(workers)
        cores = sum(w['nthreads'] for w in workers.values())
        memory = sum(w['memory_limit'] for w in workers.values())

        text = """
<div>
  <style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
  </style>
  <table style="text-align: right;">
    <tr><th>Workers</th> <td>%d</td></tr>
    <tr><th>Cores</th> <td>%d</td></tr>
    <tr><th>Memory</th> <td>%s</td></tr>
  </table>
</div>
""" % (n_workers, cores, format_bytes(memory))
        return text
Example #6
def main(args=None):
    args = parse_args(args)

    if args.protocol == 'ucx':
        address = dask.config.get("distributed.comm.ucxaddress")
        if address is None:
            raise ValueError("Set distributed.comm.ucxaddress")
        client = Client(address)
    else:
        kwargs = {'n_workers': 2, 'threads_per_worker': 40}
        kwargs['processes'] = args.protocol == 'tcp'
        cluster = LocalCluster(**kwargs)
        client = Client(cluster)

    print(f"Connected to {client}")
    N = 1_000_000
    P = 1_000
    X = da.random.uniform(size=(N, P), chunks=(N // 100, P))
    print(format_bytes(X.nbytes))

    result = X.T.dot(X)
    start = clock()
    result.compute()
    stop = clock()
    print(result)
    print(f"\tTook {stop - start:0.2f}s")
    time.sleep(10)
Example #7
    def _widget_status(self):
        # Report the number of nodes rather than raw workers in a multi-GPU
        # worker scenario.
        nodes = len(self.scheduler_info["workers"])

        if self.use_gpu:
            nodes = int(nodes / self.n_gpus_per_node)
        if hasattr(self, "worker_spec"):
            requested = sum(
                1 if "group" not in each else len(each["group"])
                for each in self.worker_spec.values()
            )

        elif hasattr(self, "nodes"):
            requested = len(self.nodes)
        else:
            requested = nodes

        nodes = self._format_nodes(nodes, requested, self.use_gpu, self.n_gpus_per_node)

        cores = sum(v["nthreads"] for v in self.scheduler_info["workers"].values())
        cores_or_gpus = "Workers (GPUs)" if self.use_gpu else "Workers (vCPUs)"

        memory = (
            sum(
                v["gpu"]["memory-total"][0]
                for v in self.scheduler_info["workers"].values()
            )
            if self.use_gpu
            else sum(v["memory_limit"] for v in self.scheduler_info["workers"].values())
        )
        memory = format_bytes(memory)

        text = """
<div>
  <style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
    .dataframe tbody tr th {
        vertical-align: top;
    }
    .dataframe thead th {
        text-align: right;
    }
  </style>
  <table style="text-align: right;">
    <tr> <th>Nodes</th> <td>%s</td></tr>
    <tr> <th>%s</th> <td>%s</td></tr>
    <tr> <th>Memory</th> <td>%s</td></tr>
  </table>
</div>
""" % (
            nodes,
            cores_or_gpus,
            cores,
            memory,
        )
        return text
Example #8
def make_cluster_model(
    cluster_id: str,
    cluster_name: str,
    cluster: Cluster,
    adaptive: Union[Adaptive, None],
) -> ClusterModel:
    """
    Make a cluster model. This is a JSON-serializable representation
    of the information about a cluster that can be sent over the wire.

    Parameters
    ----------
    cluster_id: string
        A unique string for the cluster.

    cluster_name: string
        A display name for the cluster.

    cluster: Cluster
        The cluster out of which to make the cluster model.

    adaptive: Adaptive
        The adaptive controller for the number of workers for the cluster, or
        None if the cluster is not scaled adaptively.
    """
    # This would be a great target for a dataclass
    # once python 3.7 is in wider use.
    try:
        info = cluster.scheduler_info
    except AttributeError:
        info = cluster.scheduler.identity()
    try:
        cores = sum(d["nthreads"] for d in info["workers"].values())
    except KeyError:  # dask.__version__ < 2.0
        cores = sum(d["ncores"] for d in info["workers"].values())
    assert isinstance(info, dict)
    model = dict(
        id=cluster_id,
        name=cluster_name,
        scheduler_address=cluster.scheduler_address,
        dashboard_link=cluster.dashboard_link or "",
        workers=len(info["workers"]),
        memory=utils.format_bytes(
            sum(d["memory_limit"] for d in info["workers"].values())
        ),
        cores=cores,
    )
    if hasattr(cluster, "_supports_scaling"):
        model["supports_scaling"] = cluster._supports_scaling
    else:
        model["supports_scaling"] = True
    if adaptive:
        model["adapt"] = {"minimum": adaptive.minimum, "maximum": adaptive.maximum}

    return model
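A hypothetical call, assuming a running LocalCluster and no adaptive scaling; the id and name strings are illustrative:

# Illustrative only: exercise make_cluster_model against a local cluster.
from distributed import LocalCluster

cluster = LocalCluster(n_workers=2)
model = make_cluster_model("cluster-0", "my-cluster", cluster, adaptive=None)
print(model["workers"], model["cores"], model["memory"])
cluster.close()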
Example #9
    def __repr__(self):
        running_workers = self._count_active_workers()
        running_cores = running_workers * self.worker_process_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                '(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)' %
                (running_cores, format_bytes(running_memory), running_workers,
                 total_workers, len(self.running_jobs), total_jobs))
Example #10
    def _widget_status(self):
        try:
            workers = self.scheduler_info["workers"]
        except KeyError:
            return None
        else:
            n_workers = len(workers)
            cores = sum(w["nthreads"] for w in workers.values())
            memory = sum(w["memory_limit"] for w in workers.values())

            return _widget_status_template % (n_workers, cores, format_bytes(memory))
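This and several later snippets reference a module-level _widget_status_template that is not shown; a minimal stand-in with the same three placeholders (worker count, cores, formatted memory) might look like:

# Minimal stand-in for the template these snippets assume; the real one in
# distributed renders a styled HTML table like the inline versions above.
_widget_status_template = """
<table style="text-align: right;">
  <tr><th>Workers</th> <td>%d</td></tr>
  <tr><th>Cores</th> <td>%d</td></tr>
  <tr><th>Memory</th> <td>%s</td></tr>
</table>
"""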
Example #11
def test_pause_executor(c, s, a):
    memory = psutil.Process().memory_info().rss
    a.memory_limit = memory / 0.8 + 200e6
    np = pytest.importorskip('numpy')

    def f():
        x = np.ones(int(300e6), dtype='u1')
        sleep(1)

    with captured_logger(logging.getLogger('distributed.worker')) as logger:
        future = c.submit(f)
        futures = c.map(slowinc, range(10), delay=0.1)

        yield gen.sleep(0.3)
        assert a.paused, (format_bytes(psutil.Process().memory_info().rss),
                          format_bytes(a.memory_limit))
        out = logger.getvalue()
        assert 'memory' in out.lower()
        assert 'pausing' in out.lower()

    assert sum(f.status == 'finished' for f in futures) < 4

    yield wait(futures)
Example #13
    def _widget_status(self):
        if self._internal_client is None:
            return None
        try:
            workers = self._internal_client._scheduler_identity["workers"]
        except KeyError:
            if self._internal_client.status in ("closing", "closed"):
                return None
        else:
            n_workers = len(workers)
            cores = sum(w["nthreads"] for w in workers.values())
            memory = sum(w["memory_limit"] for w in workers.values())

            return _widget_status_template % (n_workers, cores,
                                              format_bytes(memory))
Example #14
def main(args=None):
    args = parse_args(args)

    if args.protocol == 'ucx':
        sched_str = "ucx://"+ args.server + ":13337"
        client = Client(sched_str)
    elif args.protocol == 'tcp':
        sched_str = "tcp://"+ args.server + ":13337"
        client = Client(sched_str)
    else:
        kwargs = {'n_workers': 2, 'threads_per_worker': 40}
        kwargs['processes'] = args.protocol == 'tcp'
        cluster = LocalCluster(**kwargs)
        client = Client(cluster)

    print(f"Connected to {client}")
    N = int(args.length)
    P = int(args.length)
    RS = da.random.RandomState(RandomState=cupy.random.RandomState)
    #RS = da.random.RandomState(123)
    X = RS.normal(10, 1, size=(N, P))
    #X = da.random.uniform(size=(N, P), chunks=(N/100, P/100))
    X = X.persist()  # persist returns a new collection; the result must be bound
    print(format_bytes(X.nbytes))

    result = (X + X.T).sum()
    start = clock()
    result.compute()
    #with get_task_stream() as ts:
    #    result.compute()
    stop = clock()
    #print(ts.data)
    print(result)
    print(format_bytes(X.nbytes))
    print(f"\tTook {stop - start:0.2f}s")
    time.sleep(1)
Example #15
def main(args=None):
    args = parse_args(args)
    client = Client(args.scheduler_address)  # noqa
    X = da.random.random(size=(100_000, 10_000), chunks=1_000)

    protocol = client.scheduler_info()['address'].split(":")[0]
    ctx = base.maybe_setup_profile(args.profile, 'bench-array-ops', protocol)
    x = X[:10].dot(X.T).sum(1)

    print("Array size:", format_bytes(X.nbytes))
    print("Client    :", client)
    print("Profile?  :", "yes" if args.profile else "no")
    print("-" * 80)

    with ctx:
        start = clock()
        dask.compute(x.sum(), x.mean(), x.std())
        stop = clock()
    print(f"\t Took {stop - start:0.2f}s")
Example #16
def main(args=None):
    args = parse_args(args)
    client = Client(address=args.scheduler_address)
    protocol = client.scheduler_info()['address'].split(":")[0]
    ctx = base.maybe_setup_profile(args.profile, 'dot-product', protocol)

    print(f"Connected to {client}")
    N = 1_000_000
    P = 1_000
    X = da.random.uniform(size=(N, P), chunks=(N // 100, P))
    print(format_bytes(X.nbytes))

    result = X.T.dot(X)
    with ctx:
        start = clock()
        result.compute()
        stop = clock()

    print(f"\tTook {stop - start:0.2f}s")
Example #17
async def connect(host, port, n_bytes, n_iter, recv, np, verbose,
                  increment):
    ep = ucp.get_endpoint(host.encode(), port)
    arr = np.zeros(n_bytes, dtype='u1')

    start = clock()

    for i in range(n_iter):
        await ep.send_obj(arr)
        if recv == 'recv_into':
            await ep.recv_into(arr, arr.nbytes)
        else:
            # This is failing right now
            msg = await ep.recv_obj(arr.nbytes, cuda=np.__name__ == 'cupy')
            arr = np.asarray(msg.get_obj())

    stop = clock()

    expected = np.ones(n_bytes, dtype='u1')
    #            0 or n_iter
    expected *= (int(increment) * n_iter)
    np.testing.assert_array_equal(arr, expected)

    took = stop - start

    # 2 for round-trip, n_iter for number of trips.
    print("Roundtrip benchmark")
    print("-------------------")
    print(f"n_iter   | {n_iter}")
    print(f"n_bytes  | {format_bytes(n_bytes)}")
    print(f"recv     | {recv}")
    print(f"object   | {np.__name__}")
    print(f"inc      | {increment}")
    print("\n===================")
    print(format_bytes(2 * n_iter * arr.nbytes / took), '/ s')
    print("===================")

    await ep.recv_future()
    await ep.send_obj(np.ones(1))
    ep.close()
Example #18
def main():
    args = parse_args()
    q1 = mp.Queue()
    p1 = mp.Process(target=server, args=(q1, args))
    p1.start()
    port = q1.get()
    q2 = mp.Queue()
    p2 = mp.Process(target=client, args=(q2, port, args))
    p2.start()
    times = q2.get()
    p1.join()
    p2.join()
    assert not p1.exitcode
    assert not p2.exitcode
    assert len(times) == args.n_iter

    print("Roundtrip benchmark")
    print("--------------------------")
    print(f"n_iter      | {args.n_iter}")
    print(f"n_bytes     | {format_bytes(args.n_bytes)}")
    print(f"object      | {args.object_type}")
    print(f"reuse alloc | {args.reuse_alloc}")
    print("==========================")
    if args.object_type == "numpy":
        print(f"Device(s)    | Single CPU")
    else:
        print(f"Device(s)   | {args.server_dev}, {args.client_dev}")
    print(
        f"Average     | {format_bytes(2 * args.n_iter * args.n_bytes / sum(times))}/s"
    )
    print("--------------------------")
    print("Iterations")
    print("--------------------------")
    for i, t in enumerate(times):
        ts = format_bytes(2 * args.n_bytes / t)
        ts = (" " * (9 - len(ts))) + ts
        print("%03d         |%s/s" % (i, ts))
Example #19
async def connect(host, port, n_bytes, n_iter, recv, np, verbose, increment):
    """
    connect to server and write data
    """

    ep = await ucp.create_endpoint(host, port)
    msg = np.zeros(n_bytes, dtype="u1")
    msg_size = numpy.array([msg.nbytes], dtype=np.uint64)

    start = clock()

    for i in range(n_iter):
        # send first message
        await ep.send(msg, msg_size)  # send the real message
        resp = np.empty_like(msg)
        await ep.recv(resp, msg_size)  # receive the echo

    stop = clock()

    expected = np.ones(n_bytes, dtype="u1")
    expected *= int(increment) * n_iter
    np.testing.assert_array_equal(msg, expected)

    took = stop - start

    # 2 for round-trip, n_iter for number of trips.
    print("Roundtrip benchmark")
    print("-------------------")
    print(f"n_iter   | {n_iter}")
    print(f"n_bytes  | {format_bytes(n_bytes)}")
    print(f"recv     | {recv}")
    print(f"object   | {np.__name__}")
    print(f"inc      | {increment}")
    print("\n===================")
    print(format_bytes(2 * n_iter * msg.nbytes / took), "/ s")
    print("===================")
Example #20
from dask.distributed import Client
import time
#client = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB')
client = Client('localhost:8786')
print(client)

import dask
from distributed.utils import format_bytes

import dask_ml.cluster
import dask_ml.datasets


X, y = dask_ml.datasets.make_blobs(
    n_samples=100000,
    n_features=50,
    centers=3,
    chunks=10000,
)

print(format_bytes(X.nbytes))

X = X.persist()

km = dask_ml.cluster.KMeans(n_clusters=3, init_max_iter=2, oversampling_factor=10, random_state=0)

t = time.time()
km.fit(X)
print('Time kmeans distributed:', time.time() - t)
Example #21
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 walltime=None,
                 threads=None,
                 python=sys.executable,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError(
                'JobQueueCluster is an abstract class that should not be instantiated.'
            )

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' %
                                     self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' %
                                        self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' %
                                        self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' %
                                            self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' %
                                              self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' %
                                        self.scheduler_name)
        if log_directory is None:
            log_directory = dask.config.get('jobqueue.%s.log-directory' %
                                            self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads', None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ['--interface', interface]
            kwargs.setdefault('ip', get_ip_interface(interface))
        else:
            kwargs.setdefault('ip', '')

        # Bokeh diagnostics server should listen on all interfaces
        diagnostics_ip_and_port = ('', 8787)
        self.local_cluster = LocalCluster(
            n_workers=0, diagnostics_port=diagnostics_ip_and_port, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = '%(python)s -m distributed.cli.dask_worker' % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ['--nthreads', self.worker_threads]
        if processes is not None and processes > 1:
            command_args += ['--nprocs', processes]

        mem = format_bytes(self.worker_memory / self.worker_processes)
        command_args += ['--memory-limit', mem.replace(' ', '')]
        command_args += ['--name', '%s--${JOB_ID}--' % name]

        if death_timeout is not None:
            command_args += ['--death-timeout', death_timeout]
        if local_directory is not None:
            command_args += ['--local-directory', local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = ' '.join(map(str, command_args))

        self._target_scale = 0

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
Example #22
    def worker_process_memory(self):
        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(" ", "")
        return mem
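The replace(" ", "") step keeps the generated --memory-limit argument free of embedded spaces, so the job-script command line needs no extra quoting; a quick check of what it produces:

# Illustrative: 24 GB of job memory split across 2 worker processes.
from distributed.utils import format_bytes

mem = format_bytes(24e9 / 2)  # e.g. '11.18 GiB' (unit text varies by version)
print(mem.replace(" ", ""))   # '11.18GiB'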
Example #23
                           broadcast=True)
    cl_bin_utils = scatter_dict(kappa0.cl_bin_utils, broadcast=True)
    xi_bin_utils = scatter_dict(kappa0.xi_bin_utils, broadcast=True)

    ##`fix_cosmo` sets only one cosmology. If `False`, then it recalculates the power spectrum every time.
    if fix_cosmo:
        kappa0.Ang_PS.angular_power_z()
    else:
        kappa0.Ang_PS.reset()
    print('kappa0 pk', kappa0.Ang_PS.PS.pk_func)

    kappa0 = client.scatter(kappa0, broadcast=True)

    proc = psutil.Process()
    print('starting mcmc ', 'mem, peak mem: ',
          format_bytes(proc.memory_info().rss),
          int(getrusage(RUSAGE_SELF).ru_maxrss / 1024. / 1024.))

    ##define functions
    def get_priors(params):  #assume flat priors for now
        x = np.logical_or(np.any(params > priors_max, axis=1),
                          np.any(params < priors_min, axis=1))
        p = np.zeros(len(params))
        p[x] = -np.inf
        return p

    def assign_zparams(zbins={}, p_name='', p_value=None):
        pp = p_name.split('_')
        p_n = pp[0]
        bin_indx = int(pp[1])  # np.int was removed in recent NumPy; plain int works
        zbins[bin_indx][p_n] = p_value
Example #24
def client(queue, port, server_address, args):
    if args.client_cpu_affinity >= 0:
        os.sched_setaffinity(0, [args.client_cpu_affinity])

    ucp.init()

    if args.object_type == "numpy":
        import numpy as np
    elif args.object_type == "cupy":
        import cupy as np

        np.cuda.runtime.setDevice(args.client_dev)
    else:
        import cupy as np

        import rmm

        rmm.reinitialize(
            pool_allocator=True,
            managed_memory=False,
            initial_pool_size=args.rmm_init_pool_size,
            devices=[args.client_dev],
        )
        np.cuda.runtime.setDevice(args.client_dev)
        np.cuda.set_allocator(rmm.rmm_cupy_allocator)

    async def run():
        ep = await ucp.create_endpoint(server_address, port)

        msg_send_list = []
        msg_recv_list = []
        if not args.reuse_alloc:
            for i in range(args.n_iter):
                msg_send_list.append(np.arange(args.n_bytes, dtype="u1"))
                msg_recv_list.append(np.zeros(args.n_bytes, dtype="u1"))
        else:
            t1 = np.arange(args.n_bytes, dtype="u1")
            t2 = np.zeros(args.n_bytes, dtype="u1")
            for i in range(args.n_iter):
                msg_send_list.append(t1)
                msg_recv_list.append(t2)
        assert msg_send_list[0].nbytes == args.n_bytes
        assert msg_recv_list[0].nbytes == args.n_bytes
        if args.cuda_profile:
            np.cuda.profiler.start()
        times = []
        for i in range(args.n_iter):
            start = clock()
            await ep.send(msg_send_list[i], args.n_bytes)
            await ep.recv(msg_recv_list[i], args.n_bytes)
            stop = clock()
            times.append(stop - start)
        if args.cuda_profile:
            np.cuda.profiler.stop()
        queue.put(times)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(run())
    loop.close()
    times = queue.get()
    assert len(times) == args.n_iter
    print("Roundtrip benchmark")
    print("--------------------------")
    print(f"n_iter      | {args.n_iter}")
    print(f"n_bytes     | {format_bytes(args.n_bytes)}")
    print(f"object      | {args.object_type}")
    print(f"reuse alloc | {args.reuse_alloc}")
    print("==========================")
    if args.object_type == "numpy":
        print("Device(s)   | CPU-only")
        s_aff = (args.server_cpu_affinity
                 if args.server_cpu_affinity >= 0 else "affinity not set")
        c_aff = (args.client_cpu_affinity
                 if args.client_cpu_affinity >= 0 else "affinity not set")
        print(f"Server CPU  | {s_aff}")
        print(f"Client CPU  | {c_aff}")
    else:
        print(f"Device(s)   | {args.server_dev}, {args.client_dev}")
    print(
        f"Average     | {format_bytes(2 * args.n_iter * args.n_bytes / sum(times))}/s"
    )
    print("--------------------------")
    print("Iterations")
    print("--------------------------")
    for i, t in enumerate(times):
        ts = format_bytes(2 * args.n_bytes / t)
        ts = (" " * (9 - len(ts))) + ts
        print("%03d         |%s/s" % (i, ts))
Example #25
    def run(self):
        logger.warning('Reading configuration YAML config file')
        operation_choice = self.params['operation_choice']
        machine = self.params['machine']
        job_scheduler = self.params['job_scheduler']
        queue = self.params['queue']
        walltime = self.params['walltime']
        maxmemory_per_node = self.params['maxmemory_per_node']
        maxcore_per_node = self.params['maxcore_per_node']
        chunk_per_worker = self.params['chunk_per_worker']
        freq = self.params['freq']
        spil = self.params['spil']
        output_dir = self.params.get('output_dir', results_dir)
        now = datetime.datetime.now()
        output_dir = os.path.join(output_dir, f'{machine}/{str(now.date())}')
        os.makedirs(output_dir, exist_ok=True)
        parameters = self.params['parameters']
        num_workers = parameters['number_of_workers_per_nodes']
        num_threads = parameters.get('number_of_threads_per_workers', 1)
        num_nodes = parameters['number_of_nodes']
        chunking_schemes = parameters['chunking_scheme']
        io_formats = parameters['io_format']
        filesystems = parameters['filesystem']
        fixed_totalsize = parameters['fixed_totalsize']
        chsz = parameters['chunk_size']
        writefile_dir = parameters['writefile_dir']
        for wpn in num_workers:
            self.create_cluster(
                job_scheduler=job_scheduler,
                maxcore=maxcore_per_node,
                walltime=walltime,
                memory=maxmemory_per_node,
                queue=queue,
                wpn=wpn,
            )
            for num in num_nodes:
                self.client.cluster.scale(num * wpn)
                cluster_wait(self.client, num * wpn)
                timer = DiagnosticTimer()
                # dfs = []
                logger.warning(
                    '#####################################################################\n'
                    f'Dask cluster:\n'
                    f'\t{self.client.cluster}\n')
                now = datetime.datetime.now()
                csv_filename = f"{output_dir}/compute_study_{now.strftime('%Y-%m-%d_%H-%M-%S')}.csv"
                for chunk_size in chsz:

                    for io_format in io_formats:

                        for filesystem in filesystems:

                            if filesystem == 's3':
                                profile = parameters['profile']
                                bucket = parameters['bucket']
                                endpoint_url = parameters['endpoint_url']
                                fs = fsspec.filesystem(
                                    's3',
                                    profile=profile,
                                    anon=False,
                                    client_kwargs={
                                        'endpoint_url': endpoint_url
                                    },
                                )
                                root = f'{bucket}/test1'
                            elif filesystem == 'posix':
                                fs = LocalFileSystem()
                                root = writefile_dir
                                if not os.path.isdir(f'{root}'):
                                    os.makedirs(f'{root}')
                            for chunking_scheme in chunking_schemes:

                                logger.warning(
                                    f'Benchmark starting with: \n\tworker_per_node = {wpn},'
                                    f'\n\tnum_nodes = {num}, \n\tchunk_size = {chunk_size},'
                                    f'\n\tchunking_scheme = {chunking_scheme},'
                                    f'\n\tchunk per worker = {chunk_per_worker}'
                                    f'\n\tio_format = {io_format}'
                                    f'\n\tfilesystem = {filesystem}')
                                ds, chunks = timeseries(
                                    fixed_totalsize=fixed_totalsize,
                                    chunk_per_worker=chunk_per_worker,
                                    chunk_size=chunk_size,
                                    chunking_scheme=chunking_scheme,
                                    io_format=io_format,
                                    num_nodes=num,
                                    freq=freq,
                                    worker_per_node=wpn,
                                )
                                # wait(ds)
                                dataset_size = format_bytes(ds.nbytes)
                                logger.warning(ds)
                                logger.warning(
                                    f'Dataset total size: {dataset_size}')

                                for op in self.operations[operation_choice]:
                                    with timer.time(
                                            'runtime',
                                            operation=op.__name__,
                                            fixed_totalsize=fixed_totalsize,
                                            chunk_size=chunk_size,
                                            chunk_per_worker=chunk_per_worker,
                                            dataset_size=dataset_size,
                                            worker_per_node=wpn,
                                            threads_per_worker=num_threads,
                                            num_nodes=num,
                                            chunking_scheme=chunking_scheme,
                                            io_format=io_format,
                                            filesystem=filesystem,
                                            root=root,
                                            machine=machine,
                                            maxmemory_per_node=
                                            maxmemory_per_node,
                                            maxcore_per_node=maxcore_per_node,
                                            spil=spil,
                                    ):
                                        fname = f'{chunk_size}{chunking_scheme}{filesystem}{num}'
                                        if op.__name__ == 'writefile':
                                            print(ds.sst.data.chunksize)
                                            filename = op(
                                                ds, fs, io_format, root, fname)
                                        elif op.__name__ == 'openfile':
                                            ds = op(fs, io_format, root,
                                                    chunks, chunk_size)
                                        elif op.__name__ == 'deletefile':
                                            ds = op(fs, io_format, root,
                                                    filename)
                                        else:
                                            op(ds)
                        # kills ds, and every other dependent computation
                        logger.warning('Computation done')
                        self.client.cancel(ds)
                        temp_df = timer.dataframe()
                        temp_df.to_csv(csv_filename, index=False)
                        # dfs.append(temp_df)

                # now = datetime.datetime.now()
                # filename = f"{output_dir}/compute_study_{now.strftime('%Y-%m-%d_%H-%M-%S')}.csv"
                # df = pd.concat(dfs)
                # df.to_csv(filename, index=False)
                logger.warning(
                    f'Persisted benchmark result file: {csv_filename}')

            logger.warning(
                'Shutting down the client and cluster before changing number of workers per nodes'
            )
            self.client.cluster.close()
            logger.warning('Cluster shutdown finished')
            self.client.close()
            logger.warning('Client shutdown finished')

        logger.warning('=====> The End <=========')
Example #26
    def generate_maps(self):
        client = client_get(scheduler_info=self.scheduler_info)
        SJ = client.scatter(self, broadcast=True)
        step = self.nworkers * self.njobs_submit_per_worker  # min(nsim,len(client.scheduler_info()['workers']))

        i = 0
        j = 0
        futures = [delayed(get_clsim)(SJ, i) for i in np.arange(self.nsim)]
        futures_done = []
        while j < self.nsim:
            futures_j = client.compute(futures[j:j + step])
            wait_futures(futures_j)
            futures_done += futures_j
            j += step
        del futures

        if self.kappa_class.do_pseudo_cl:
            self.cl_b = {
                im: {
                    'full': np.zeros(self.sim_clb_shape, dtype='float32')
                }
                for im in self.Master_algs
            }
            for im in self.Master_algs:
                self.cl_b[im].update({jks: {} for jks in self.jk_stat_keys})

            self.pcl_b = {
                'full': np.zeros(self.sim_clb_shape, dtype='float32')
            }
            self.pcl_b.update({jks: {} for jks in self.jk_stat_keys})
        if self.do_xi:
            self.xi_b = {
                'full': np.zeros(self.sim_xib_shape, dtype='float32')
            }  #  {im:np.zeros(sim_clb_shape,dtype='float32') for im in Master_algs}}
            self.xi_b.update({
                jks: {}
                for jks in self.jk_stat_keys
            })  #{im:{} for im in Master_algs} for jks in jk_stat_keys})
            im = 'xi_imaster'
            self.cl_b[im] = {
                'full': np.zeros(self.sim_clb_shape, dtype='float32')
            }
            self.cl_b[im].update({jks: {} for jks in self.jk_stat_keys})

        for i in np.arange(self.nsim):
            tt = futures_done[i].result()
            if self.kappa_class.do_pseudo_cl:
                self.pcl_b[i] = tt[0]
                for k in self.Master_algs:
                    self.cl_b[k][i] = tt[1][k]
            if self.do_xi:
                self.xi_b[i] = tt[2]
                k = 'xi_imaster'
                self.cl_b[k][i] = tt[1][k]

            client.cancel(futures_done[i])
        proc = psutil.Process()
        print('done map ', i, thread_count(), 'mem, peak mem: ',
              format_bytes(proc.memory_info().rss),
              int(getrusage(RUSAGE_SELF).ru_maxrss / 1024. / 1024.))
        #         del futures_done
        print('done map ', i, thread_count(), 'mem, peak mem: ',
              format_bytes(proc.memory_info().rss),
              int(getrusage(RUSAGE_SELF).ru_maxrss / 1024. / 1024.))
        j += step
Example #27
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 walltime=None,
                 threads=None,
                 **kwargs
                 ):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used
        # directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instantiated.')

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' % self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' % self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' % self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' % self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' % self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' % self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads', None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError("You must specify how many cores to use per job "
                             "like ``cores=8``")

        if memory is None:
            raise ValueError("You must specify how much memory to use per job "
                             "like ``memory='24 GB'``")

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.local_cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory)

        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = (
            '%(python)s -m distributed.cli.dask_worker' % dict(python=sys.executable))
        self._command_template = ' '.join([dask_worker_command, self.scheduler.address])
        self._command_template += " --nthreads %d" % self.worker_threads
        if processes is not None and processes > 1:
            self._command_template += " --nprocs %d" % processes

        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(' ', '')
        self._command_template += " --memory-limit %s" % mem

        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d" # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra