Example #1
    def connect(cls, *args, **kwargs):
        """
        Setup slurm cluster
        """

        # Create a slurm cluster with all the various class settings
        cls.local_cluster = cls.cluster_controller_class(
            queue=cls.queue,
            project=cls.project,
            walltime=cls.walltime,
            job_cpu=cls.job_cpu,
            cores=cls.cores,
            processes=cls.processes,
            job_mem=cls.job_mem,
            env_extra=cls.env_extra,
            interface=cls.interface,
            local_directory=cls.local_directory,
            memory=cls.memory,
            job_extra=cls.cluster_controller_options)

        # Hacks for older version of dask_jobqueue
        try:
            if cls.worker_memory_limit == 0:
                cls.local_cluster._command_template = memory_limit_0(
                    cls.local_cluster._command_template)
            if cls.hack_cluster_controller_for_NYU:
                cls.local_cluster.job_header = fix_header_for_nyu(
                    cls.local_cluster.job_header, DEFAULT_NYU_HEADER)

        # Hacks for newer version of dask_jobqueue
        except AttributeError:
            from dask_jobqueue.slurm import SLURMJob

            class SLURMJobMemLimit(SLURMJob):
                def __init__(self, *args, **kwargs):
                    super(SLURMJobMemLimit, self).__init__(*args, **kwargs)
                    if cls.worker_memory_limit == 0:
                        self._command_template = memory_limit_0(
                            self._command_template)
                    if cls.hack_cluster_controller_for_NYU:
                        self.job_header = fix_header_for_nyu(
                            self.job_header, DEFAULT_NYU_HEADER)

            cls.local_cluster.job_cls = SLURMJobMemLimit

        if cls.control_adaptive:
            cls.local_cluster.adapt(minimum=cls.minimum_cores,
                                    maximum=cls.maximum_cores,
                                    interval=cls.interval,
                                    wait_count=cls.wait_count)
        else:
            cls.local_cluster.adapt(minimum=cls.maximum_cores,
                                    maximum=cls.maximum_cores,
                                    interval=cls.interval,
                                    wait_count=cls.wait_count)

        cls.local_cluster.scheduler.allowed_failures = cls.allowed_failures
        cls.client = distributed.Client(cls.local_cluster)

        return True
Example #2
    def connect(cls, *args, **kwargs):
        """
        Setup slurm cluster
        """

        # Create a slurm cluster with all the various class settings
        cls.local_cluster = cls.cluster_controller_class(queue=cls.queue, project=cls.project, walltime=cls.walltime,
                                                         job_cpu=cls.job_cpu, cores=cls.cores, processes=cls.processes,
                                                         job_mem=cls.job_mem, env_extra=cls.env_extra,
                                                         interface=cls.interface, local_directory=cls.local_directory,
                                                         memory=cls.memory, job_extra=cls.cluster_controller_options)

        # Deactivate the worker memory nanny
        if cls.worker_memory_limit == 0:
            cls.local_cluster._command_template = memory_limit_0(cls.local_cluster._command_template)

        # Rewrite the command headers so that the SLURM controller will work with the NYU prince cluster
        if cls.hack_cluster_controller_for_NYU:
            cls.local_cluster.job_header = fix_header_for_nyu(cls.local_cluster.job_header, DEFAULT_NYU_HEADER)

        if cls.control_adaptive:
            cls.local_cluster.adapt(minimum=cls.minimum_cores, maximum=cls.maximum_cores, interval=cls.interval,
                                    wait_count=cls.wait_count)
        else:
            cls.local_cluster.adapt(minimum=cls.maximum_cores, maximum=cls.maximum_cores, interval=cls.interval,
                                    wait_count=cls.wait_count)

        cls.local_cluster.scheduler.allowed_failures = cls.allowed_failures
        cls.client = distributed.Client(cls.local_cluster)

        return True
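The `memory_limit_0` helper called above is not shown in any of these snippets; the surrounding comments suggest it rewrites the worker start command so the dask worker memory nanny is disabled. A minimal hypothetical sketch, assuming the command template carries the standard dask-worker `--memory-limit` option:

import re

def memory_limit_0(command_template):
    # Hypothetical sketch: force '--memory-limit 0' so the worker memory
    # nanny is disabled (0 means "no limit" for dask-worker).
    if '--memory-limit' in command_template:
        return re.sub(r'--memory-limit[ =]\S+', '--memory-limit 0', command_template)
    return command_template + ' --memory-limit 0'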
Example #3
    def connect(cls, *args, **kwargs):
        """
        Setup slurm cluster
        """

        # Create a slurm cluster with all the various class settings
        cls._local_cluster = cls._cluster_controller_class(queue=cls._queue,
                                                           project=cls._project,
                                                           interface=cls._interface,
                                                           walltime=cls._job_time,
                                                           job_cpu=cls._job_n_workers * cls._worker_n_threads,
                                                           cores=cls._job_n_workers * cls._worker_n_threads,
                                                           processes=cls._job_n_workers,
                                                           job_mem=cls._job_mem,
                                                           env_extra=cls._config_env(),
                                                           local_directory=cls._local_directory,
                                                           memory=cls._job_mem,
                                                           job_extra=cls._job_slurm_commands,
                                                           job_cls=SLURMJobNoMemLimit)

        cls.client = distributed.Client(cls._local_cluster, direct_to_workers=True)

        cls._add_local_node_workers(cls._num_local_workers)
        cls._tracker = WorkerTracker()

        utils.Debug.vprint("Dask dashboard: {cl}".format(cl=cls.client.dashboard_link), level=0)

        return True
Example #4
    def connect(cls,
                scheduler_uri,
                *args,
                client_kwargs: Optional[dict] = None,
                **kwargs):
        """
        Connect to a remote dask scheduler.

        Parameters
        ----------
        scheduler_uri: str
            Compatible with the :code:`address` parameter of :class:`distributed.Client`.
        client_kwargs: dict or None
            Passed as kwargs to :class:`distributed.Client`.
            :code:`client_kwargs['set_as_default']` is set to :code:`False`
            unless specified otherwise to avoid interference with Dask-based workflows.
            Pass :code:`client_kwargs={'set_as_default': True}` to set the Client as the
            default Dask scheduler and keep it running when the Context closes.
        *args, **kwargs: Passed to :class:`DaskJobExecutor`.

        Returns
        -------
        DaskJobExecutor
            the connected JobExecutor
        """
        if client_kwargs is None:
            client_kwargs = {}
        if client_kwargs.get('set_as_default') is None:
            client_kwargs['set_as_default'] = False
        is_local = not client_kwargs['set_as_default']
        client = dd.Client(address=scheduler_uri, **client_kwargs)
        return cls(client=client, is_local=is_local, *args, **kwargs)
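A hedged usage sketch for the classmethod above; the scheduler address is a placeholder and `DaskJobExecutor` is the executor class named in the docstring:

# Hypothetical usage; 'tcp://localhost:8786' is a placeholder scheduler address.
executor = DaskJobExecutor.connect(
    'tcp://localhost:8786',
    client_kwargs={'set_as_default': True},  # opt in to making the Client the default Dask scheduler
)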
Example #5
def local_cluster_url():
    """
    Shared dask cluster, can be used repeatedly by different executors.

    This allows numba caching across tests, without sharing the executor,
    for example
    """
    cluster_port = find_unused_port()
    devices = detect()
    spec = cluster_spec(
        # Only use at most 2 CPUs and 1 GPU
        cpus=devices['cpus'][:2],
        cudas=devices['cudas'][:1],
        has_cupy=devices['has_cupy'])

    cluster_kwargs = {
        'silence_logs': logging.WARN,
        'scheduler': {
            'cls': Scheduler,
            'options': {
                'port': cluster_port
            },
        },
    }

    with set_num_threads_env(1, set_numba=False):
        cluster = dd.SpecCluster(workers=spec, **(cluster_kwargs or {}))
        client = dd.Client(cluster, set_as_default=False)
        client.wait_for_workers(len(spec))
        client.close()

    yield 'tcp://localhost:%d' % cluster_port

    cluster.close()
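A hedged sketch of how a test might consume this shared cluster, assuming the generator above is registered as a pytest fixture and paired with a `connect()`-style executor such as the one in Example #19:

# Hypothetical test; the fixture registration and executor class are assumptions.
def test_against_shared_cluster(local_cluster_url):
    executor = DaskJobExecutor.connect(local_cluster_url)
    try:
        pass  # run workloads against the shared scheduler here
    finally:
        executor.client.close()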
Example #6
 def wrapped(df, *args, **kwargs):
     if cluster is None:
         my_cluster = get_or_create_cluster()
     else:
         my_cluster = cluster
     client = dd.Client(my_cluster)
     try:
         task_queue = dd.Queue()
         result_queue = dd.Queue()
         workers = []
         for _ in range(len(my_cluster.workers)):
             w = client.submit(worker, task_queue, result_queue, args,
                               kwargs)
             workers.append(w)
         with tempfile.TemporaryDirectory(
                 prefix=TMPDIR_PREFIX) as tmpdir:
             LOG.info('pandas parallel tmpdir={}', tmpdir)
             n_tasks = 0
             for df_path in parallel_df.stream_split(df, tmpdir):
                 task_queue.put(df_path)
                 n_tasks += 1
             task_queue.put(_TASK_END)
             range_n_tasks = range(n_tasks)
             if progress_bar:
                 range_n_tasks = tqdm.tqdm(range_n_tasks,
                                           ncols=80,
                                           ascii=True)
             if iterator:
                 return _get_result_iterator(range_n_tasks,
                                             result_queue)
             else:
                 return _get_result_dataframe(range_n_tasks,
                                              result_queue)
     finally:
         client.close()
Example #7
    def make_local(cls, spec=None, cluster_kwargs=None, client_kwargs=None):
        """
        Spin up a local dask cluster

        interesting cluster_kwargs:
            threads_per_worker
            n_workers

        Returns
        -------
        DaskJobExecutor
            the connected JobExecutor
        """
        if spec is None:
            spec = cluster_spec(**detect())
        if client_kwargs is None:
            client_kwargs = {}
        if client_kwargs.get('set_as_default') is None:
            client_kwargs['set_as_default'] = False

        if cluster_kwargs is None:
            cluster_kwargs = {}
        if cluster_kwargs.get('silence_logs') is None:
            cluster_kwargs['silence_logs'] = logging.WARN

        cluster = dd.SpecCluster(workers=spec, **(cluster_kwargs or {}))
        client = dd.Client(cluster, **(client_kwargs or {}))

        return cls(client=client, is_local=True, lt_resources=True)
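A hedged usage sketch that reuses the `detect()`/`cluster_spec()` helpers Example #5 calls to build a worker spec; restricting the spec to two CPU workers and no GPUs is purely illustrative:

# Hypothetical usage: build a reduced spec and hand it to make_local().
devices = detect()
spec = cluster_spec(cpus=devices['cpus'][:2], cudas=[], has_cupy=devices['has_cupy'])
executor = DaskJobExecutor.make_local(spec=spec)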
Example #8
def main(path, scheduler_uri, stackheight, dtype, scan_size, detector_size,
         bench, which, skip, num_workers, num_nodes, warmup_rounds):
    scan_size = tuple(int(x) for x in scan_size.split(","))
    detector_size = tuple(int(x) for x in detector_size.split(","))
    (a, b) = scan_size
    (f, g) = detector_size

    # We have to make sure that this happens before numpy is loaded.
    if bench == 'libertem':
        for k in [
                'OMP_NUM_THREADS', 'OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS'
        ]:
            os.environ[k] = '1'

    import numpy as np

    roi = np.zeros(shape=scan_size, dtype=bool)
    roi[(a * 5) // 10:(a * 6) // 10, (b * 2) // 10:(b * 3) // 10] = True

    mask = np.zeros(shape=detector_size, dtype=np.float32)
    mask[(f * 5) // 10:(f * 6) // 10, (g * 2) // 10:(g * 3) // 10] = 1

    if bench == 'libertem':
        from LiberTEMBenchmark import LiberTEMBenchmark as Bm
    elif bench == 'numpy':
        from NumpyBenchmark import NumpyBenchmark as Bm
    elif bench == 'dask.distributed':
        import dask.distributed as dd
        dd.Client(address=scheduler_uri)
        from DaskBenchmark import DaskBenchmark as Bm
    elif bench == 'dask':
        from DaskBenchmark import DaskBenchmark as Bm

    b = Bm(path, dtype, scan_size, detector_size, warmup_rounds, roi, mask)
    print(json.dumps(b.bench_all(which, skip)))
Example #9
def test_imports():
    client = dd.Client("127.0.0.1:8786")
    futures = client.map(remote_fiona_import, range(10))
    versions = client.gather(futures)
    print(
        f"remote fiona import test complete - found version {versions[0][:-1]}"
    )
Example #10
 def test_parallel(self, testcluster):
     # repeat selected test w/parallel processing engine
     client = dd.Client(testcluster)
     par_tests = ["test_dataselection"]
     for test in par_tests:
         getattr(self, test)()
     client.close()
Example #11
def client_startup(cluster, n_jobs: int, total_workers: int):
    """
    Start a dask client

    Args:
        cluster: Dask cluster
        n_jobs: number of jobs to submit to the cluster
        total_workers: number of total workers in the cluster

    Returns:
        Dask Client instance and a list of worker IDs
    """
    if n_jobs <= 0:
        raise ValueError("n_jobs must be equal to or greater than 1!")
    if isinstance(cluster, daskD.LocalCluster) or isinstance(cluster, KubeCluster):
        cluster.scale(n_jobs)
    else:
        cluster.scale(jobs=n_jobs)
    # Creating dask Client
    client = daskD.Client(cluster)
    workers = 0
    t0 = time()
    while workers < total_workers:
        workers = len(client.get_worker_logs().keys())
        # If the requested number of workers is not reached within 5 minutes, raise an exception
        if time() - t0 > 300.0:
            raise SystemError("Dask could not start the requested workers within 5 minutes! "
                              "Try a different n_jobs.")
    WorkerIds = list(client.get_worker_logs().keys())
    return client, WorkerIds
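A hedged usage sketch with a local cluster; the `daskD` alias matches the calls inside `client_startup` and the worker counts are illustrative:

# Hypothetical usage: four single-threaded local workers.
import dask.distributed as daskD

cluster = daskD.LocalCluster(n_workers=4, threads_per_worker=1)
client, worker_ids = client_startup(cluster, n_jobs=4, total_workers=4)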
Example #12
def initialize_dask(n, factor = 5, slurm = False):

    if not slurm:
        cores =  len(os.sched_getaffinity(0))
        cluster = distributed.LocalCluster(processes = False,
                                           n_workers = 1,
                                           threads_per_worker = 1)

    else:
        n = min(100, n)
        py = './enter_conda.sh python3'
        params = {
            'python' : py,
            'cores' : 1,
            'memory' : '512MB',
            'walltime' : '180',
            'processes' : 1,
            'job_extra' : [
                '--qos use-everything',
                '--array 0-{0:d}'.format(n - 1),
                '--requeue',
                '--output "/dev/null"'
            ],
            'env_extra' : [
                'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}',
                'source /etc/profile.d/modules.sh',
                'cd {0!s}'.format(CONFIG['PATHS', 'root']),
            ]
        }
        cluster = SLURMCluster(**params)
        print(cluster.job_script())
        cluster.scale(1)

    print(cluster.dashboard_link)
    return distributed.Client(cluster)
Example #13
    def connect_to_scheduler(self, scheduler_address):
        """
        Start a connection to a running scheduler. The Python interpreter is not occupied.

        Args:
            scheduler_address: Address of running scheduler of the form 'ip-address:port'

        Example:
            >>> connect_to_scheduler('127.0.0.1:8791')

        References:
            - http://distributed.readthedocs.io/en/latest/client.html
            - http://distributed.readthedocs.io/en/latest/api.html#distributed.client.Client
        """
        from dask import distributed

        # Precondition
        _verify_address(scheduler_address)

        # Establish connection
        try:
            self._connection = distributed.Client(scheduler_address)
        except Exception as err:
            raise ConnectionError(
                'Could not connect to scheduler. Did you start it? '
                'Is it still running? Is the address correct?\n{}'.format(err))
Example #14
 def make_local(cls, cluster_kwargs=None, client_kwargs=None):
     """
     interesting cluster_kwargs:
         threads_per_worker
         n_workers
     """
     cluster = dd.LocalCluster(**(cluster_kwargs or {}))
     client = dd.Client(cluster, **(client_kwargs or {}))
     return cls(client=client, is_local=True)
Example #15
def client_in_background():
    # A running Dask client can introduce a timing issue
    # between automatic closing of a numpy.memmap object and renaming
    # the underlying file
    with dd.LocalCluster() as cluster:
        client = dd.Client(cluster, set_as_default=False)
        yield
        # to fix "distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client"  # NOQA
        client.close()
Example #16
 def test_parallel(self, testcluster):
     # collect all tests of current class and repeat them in parallel
     client = dd.Client(testcluster)
     all_tests = [attr for attr in self.__dir__()
                  if (inspect.ismethod(getattr(self, attr)) and attr != "test_parallel")]
     for test in all_tests:
         getattr(self, test)()
         flush_local_cluster(testcluster)
     client.close()
Example #17
def client(tmpdir_factory, request):
    with tmpdir_factory.mktemp("dask_cluster").as_cwd():
        lc = distributed.LocalCluster(n_workers=request.param, processes=True)
        client = distributed.Client(lc)

        yield client

        client.close()
        lc.close()
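The `request.param` access implies the fixture is meant to be parametrized; a hedged sketch of how it could be registered, where the decorator and its param values are assumptions:

import distributed
import pytest

# Hypothetical registration: each value in `params` becomes request.param,
# i.e. the n_workers passed to LocalCluster below.
@pytest.fixture(params=[1, 2])
def client(tmpdir_factory, request):
    with tmpdir_factory.mktemp("dask_cluster").as_cwd():
        lc = distributed.LocalCluster(n_workers=request.param, processes=True)
        client = distributed.Client(lc)
        yield client
        client.close()
        lc.close()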
Example #18
def test_pickling():

    def pxfn(fp):
        """Pixel function for recipe. `ds` lives in the closure"""
        return np.ones(fp.shape) * id(ds)

    def slave():
        """Slave process routine. `ds` and `oldid` live in the closure and are pickled by cloudpickle in Client.submit"""
        print('slave', dd.get_worker())
        assert id(ds) != oldid, 'this test makes sense if `ds` was pickled'
        assert 'v1' in ds
        assert 'v2' not in ds
        assert 'r1' in ds
        assert 'r2' not in ds
        assert 'r3' in ds
        assert (ds._queued_count, ds._locked_count, ds.v1.activated, ds.r1.activated, ds.r3.activated) == (0, 0, False, False, True)
        assert ds.v1.get_data(0)[1] == str(oldid)
        assert (ds.r1.get_data() == oldid).all()
        assert (ds.r3.get_data() == id(ds)).all(), '`slave` and `pxfn` should share the same `ds` obj'

        ds.v1.insert_data((0, 1), ['42'])
        ds.r1.fill(42)
        assert ds.v1.get_data(1)[1] == '42'
        assert (ds.r1.get_data() == 42).all()

        ds.deactivate_all()

    ds = buzz.DataSource(max_activated=2)
    oldid = id(ds)
    fp = buzz.Footprint(
        tl=(1, 1),
        size=(10, 10),
        rsize=(10, 10),
    )
    clust = dd.LocalCluster(n_workers=1, threads_per_worker=1, scheduler_port=0)
    print()
    print(clust)
    cl = dd.Client(clust)
    print(cl)

    with ds.create_vector('v1', '/tmp/v1.shp', **V_META).delete:
        with ds.create_raster('r1', '/tmp/t1.shp', fp, float, 1).delete:
            ds.create_raster('r2', '', fp, float, 1, driver='MEM')
            ds.create_recipe_raster('r3', pxfn, fp, float)
            ds.create_vector('v2', **MEMV_META)

            ds.v1.insert_data((0, 1), [str(oldid)])
            ds.v2.insert_data((0, 1), [str(oldid)])
            ds.r1.fill(oldid)
            ds.r2.fill(oldid)

            ds.deactivate_all()
            cl.submit(slave).result()
            assert ds.v1.get_data(1)[1] == '42'
            assert (ds.r1.get_data() == 42).all()
Example #19
    def connect(cls, scheduler_uri, *args, **kwargs):
        """
        Connect to a remote dask scheduler

        Returns
        -------
        DaskJobExecutor
            the connected JobExecutor
        """
        client = dd.Client(address=scheduler_uri)
        return cls(client=client, is_local=False, *args, **kwargs)
Example #20
 def test_parallel(self, testcluster):
     # repeat selected test w/parallel processing engine
     client = dd.Client(testcluster)
     par_tests = [
         "test_relative_array_padding",
         "test_absolute_nextpow2_array_padding", "test_object_padding",
         "test_dataselection"
     ]
     for test in par_tests:
         getattr(self, test)()
     client.close()
Example #21
def create_client():
    """ Initializes a `dask.distributed` client for local computing.
    """
    cores = len(os.sched_getaffinity(0))
    nworkers = int(np.ceil(cores / 8))
    cluster = distributed.LocalCluster(n_workers=nworkers,
                                       threads_per_worker=min(cores, 8),
                                       resources={'foo': nworkers})
    print(cluster)
    print(cluster.dashboard_link)
    client = distributed.Client(cluster)
    return client
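The `resources={'foo': nworkers}` annotation only has an effect when tasks request that resource at submission time; a hedged usage sketch where `work_item` is a placeholder task:

# Hypothetical usage: each task claims one unit of 'foo', so the scheduler
# only places it on a worker with spare 'foo' capacity.
def work_item(x):
    return x * x

client = create_client()
futures = [client.submit(work_item, x, resources={'foo': 1}) for x in range(10)]
results = client.gather(futures)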
Example #22
 def __init__(self, scheduler_uri=None, client=None, is_local=False):
     self.is_local = is_local
     if client is not None:
         if scheduler_uri:
             raise ValueError(
                 "pass either client or scheduler_uri, not both")
         self.client = client
     else:
         if client:
             raise ValueError(
                 "pass either client or scheduler_uri, not both")
         self.client = dd.Client(scheduler_uri, processes=False)
Example #23
    def compute_reconstruction_error(self, lams_reg, gnd_truth):
        """Compute the reconstructions for each regularization weight error against the ground truth.

        Parameters
        ----------
        lams_reg : numpy.array_like
            List of regularization weights.
        gnd_truth : numpy.array_like
            Expected reconstruction.

        Returns
        -------
        err_l1 : numpy.array_like
            l1-norm errors for each reconstruction.
        err_l2 : numpy.array_like
            l2-norm errors for each reconstruction.
        """
        if self.verbose:
            print("Computing reconstruction error:")
            print("- Regularization weights range: [%g, %g] in %d steps" % (lams_reg[0], lams_reg[-1], len(lams_reg)))
            c = tm.time()

        err_l1 = np.empty((len(lams_reg),))
        err_l2 = np.empty((len(lams_reg),))

        if self.parallel_eval:
            client = dd.Client()
            r = client.map(self.compute_reconstruction_and_loss, lams_reg)

        for ii_l, l in enumerate(lams_reg):
            if self.parallel_eval:
                _, rec = r[ii_l].result()
            else:
                _, rec = self.compute_reconstruction_and_loss(l)
            residual = np.abs(gnd_truth - rec)
            err_l1[ii_l] = np.linalg.norm(residual.ravel(), ord=1)
            err_l2[ii_l] = np.linalg.norm(residual.ravel(), ord=2) ** 2

        if self.verbose:
            print("Done in %g seconds.\n" % (tm.time() - c))

        if self.plot_result:
            f, axs = plt.subplots(2, 1, sharex=True)
            axs[0].set_xscale("log", nonpositive="clip")
            axs[0].plot(lams_reg, err_l1, label="Error - l1-norm")
            axs[0].legend()
            axs[1].set_xscale("log", nonpositive="clip")
            axs[1].plot(lams_reg, err_l2, label="Error - l2-norm ^ 2")
            axs[1].legend()
            plt.show(block=False)

        return err_l1, err_l2
Example #24
    def connect(cls, *args, **kwargs):
        """
        Setup local cluster
        """

        kwargs["n_workers"] = kwargs.pop("n_workers", cls.processes)
        kwargs["threads_per_worker"] = kwargs.pop("threads_per_worker", 1)
        kwargs["processes"] = kwargs.pop("processes", True)
        kwargs["local_dir"] = kwargs.pop("local_dir", cls.local_dir)

        cls.local_cluster = distributed.LocalCluster(*args, **kwargs)
        cls.client = distributed.Client(cls.local_cluster)
        return True
Example #25
    def test_corr_parallel(self, testcluster=None):

        ppl.ioff()
        client = dd.Client(testcluster)
        all_tests = [
            attr for attr in self.__dir__() if
            (inspect.ismethod(getattr(self, attr)) and 'parallel' not in attr)
        ]

        for test in all_tests:
            test_method = getattr(self, test)
            test_method()
        client.close()
        ppl.ion()
Example #26
    def test_dask_unchunked_input(self):
        p = da.from_array(self.p_def)
        t = da.from_array(self.t_def)
        q = da.from_array(self.q_def)

        # Start dask cluster
        cluster = dd.LocalCluster(n_workers=3, threads_per_worker=2)
        print(cluster.dashboard_link)
        client = dd.Client(cluster)

        out = map_blocks(relhum, t, q, p).compute()

        assert np.allclose(out, self.rh_gt_2, atol=0.1)

        client.shutdown()
Example #27
    def test_dask_chunked_input(self):
        p = da.from_array(self.p_def, chunks="auto")
        t = da.from_array(self.t_def, chunks="auto")
        q = da.from_array(self.q_def, chunks="auto")

        # Start dask cluster
        cluster = dd.LocalCluster(n_workers=3, threads_per_worker=2)
        print(cluster.dashboard_link)
        client = dd.Client(cluster)

        out = client.submit(relhum, t, q, p).result()

        assert np.allclose(out, self.rh_gt_2, atol=0.1)

        client.shutdown()
Example #28
    def test_dask_chunked_input(self):
        tk = da.from_array(np.asarray(t_def) + 273.15, chunks="auto")
        rh = da.from_array(rh_def, chunks="auto")

        # Start dask cluster
        cluster = dd.LocalCluster(n_workers=3, threads_per_worker=2)
        print(cluster.dashboard_link)
        client = dd.Client(cluster)

        out = map_blocks(dewtemp, tk, rh).compute()

        assert np.allclose(out - 273.15, dt_2, atol=0.1)

        cluster.close()
        client.close()
Example #29
def flush_local_cluster(testcluster, timeout=10):
    """
    Resets a parallel computing client to avoid memory spilling
    """
    if isinstance(testcluster, dd.LocalCluster):
        # client.restart()
        client = dd.get_client()
        client.close()
        time.sleep(1.0)
        client = dd.Client(testcluster)
        waiting = 0
        while len([w["memory_limit"] for w in testcluster.scheduler_info["workers"].values()]) == 0 \
            and waiting < timeout:
            time.sleep(1.0)
            waiting += 1
    return
Example #30
    def make_local(cls, cluster_kwargs=None, client_kwargs=None):
        """
        Spin up a local dask cluster

        interesting cluster_kwargs:
            threads_per_worker
            n_workers

        Returns
        -------
        DaskJobExecutor
            the connected JobExecutor
        """
        cluster = dd.LocalCluster(**(cluster_kwargs or {}))
        client = dd.Client(cluster, **(client_kwargs or {}))
        return cls(client=client, is_local=True)
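A hedged usage sketch of the cluster_kwargs the docstring calls out; the worker counts are illustrative:

# Hypothetical usage: two local workers with four threads each.
executor = DaskJobExecutor.make_local(
    cluster_kwargs={'n_workers': 2, 'threads_per_worker': 4},
    client_kwargs={'set_as_default': False},
)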