def test_imread_use_dask_false(resources_dir):
    # Load image as delayed dask array then as numpy array
    # Check computed task count
    with dask_utils.cluster_and_client(processes=False) as (cluster, client):
        # Get filepath
        f = resources_dir / BIG_OME_FILE

        # Check that there are no open file pointers after init
        proc = Process()
        assert str(f) not in [ofile.path for ofile in proc.open_files()]

        # Check that a client does exist
        get_client()

        # Don't use dask for reads
        use_dask(False)

        # Read image without dask
        img = AICSImage(f)
        assert img.data.shape == (3, 1, 3, 5, 325, 475)

        # Check that the file was read with base reader then rechunked with dask
        # Normally the task count for this file is 90
        assert len(optimize(img.dask_data)[0].__dask_graph__()) == 3

    # Check that there are no open file pointers after basics
    assert str(f) not in [ofile.path for ofile in proc.open_files()]
Example 2
def running_on_dask() -> bool:
    try:
        from distributed import get_client

        get_client()
        return True
    except (ImportError, ValueError):
        return False
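A quick sketch of this helper's contract, assuming a plain local setup: get_client() raises ValueError when no client or worker is attached to the current process, and returns the default client once one exists.

from distributed import Client

print(running_on_dask())          # False: no client or worker in this process yet

client = Client(processes=False)  # start a small local, in-process cluster
print(running_on_dask())          # True: get_client() now finds the default client

client.close()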
Example 3
    def get_indices(cls, axis, partitions, index_func):
        """This gets the internal indices stored in the partitions.

        Note: These are the global indices of the object. This is mostly useful
            when you have deleted rows/columns internally, but do not know
            which ones were deleted.

        Args:
            axis: The axis along which to extract the labels (0 - index, 1 - columns).
            partitions: The partitions from which to extract the indices.
            index_func: The function to be used to extract the labels.

        Returns:
            A Pandas Index object.
        """
        client = get_client()
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            # We grab the first column of blocks and extract the indices
            new_idx = ([idx.apply(func).future for idx in partitions.T[0]]
                       if len(partitions.T) else [])
        else:
            new_idx = ([idx.apply(func).future
                        for idx in partitions[0]] if len(partitions) else [])
        new_idx = client.gather(new_idx)
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
Example 4
def create_experiment_qobj(factory, weights, theta_start, theta_end, theta_step, pm, device, qobj_id=None, use_dask=False, other_arguments=None):
    # type: (Callable[[List[float], float, dict], QuantumCircuit], List[float], float, float, float, PassManager, BaseBackend, Optional[str], bool, Optional[dict]) -> Tuple[Qobj, List[float]]

    LOG.info("Creating Qobj with {}".format(
        {
            'factory': factory,
            'weights': weights,
            'theta_start': theta_start,
            'theta_end': theta_end,
            'theta_step': theta_step,
            'pm': str(pm),
            'device': str(device),
            'qobj_id': qobj_id,
            'use_dask': use_dask,
            'other_arguments': other_arguments
        }
    ))

    r = np.arange(theta_start, theta_end, theta_step)
    if use_dask:
        client = get_client()  # type: Client
        futures = [client.submit(retrieve_compiled_circuit, weights, theta, factory, pm, device, other_arguments) for theta in r]
        circuits = client.gather(futures)
    else:
        circuits = [retrieve_compiled_circuit(weights, theta, factory, pm, device, other_arguments) for theta in r]
    LOG.debug(len(r) * 5)
    # noinspection PyTypeChecker
    qobj = qiskit.compiler.assemble(circuits, backend_name=qasm_simulator().name(), shots=8192, max_credits=len(r) * 5, qobj_id=qobj_id)
    return qobj, r.tolist()
Example 5
    def save(self, commit=True):
        plugin = self.cleaned_data["plugin"]

        bash_script = None
        reqs_script = False
        py_name = None

        plugin_folder = Path(settings.VOLATILITY_PLUGIN_PATH)
        tmp_folder = plugin_folder / str(uuid.uuid4())
        os.mkdir(tmp_folder)

        with zipfile.ZipFile(plugin.file.path, "r") as f:
            for name in f.namelist():
                if name.endswith(".sh"):
                    bash_script = f.read(name)
                elif name.lower() == "requirements.txt":
                    reqs_script = True
                    with open(tmp_folder / "requirements.txt", "wb") as reqs:
                        reqs.write(f.read(name))
                elif name.endswith(".py"):
                    with open(plugin_folder / name, "wb") as reqs:
                        reqs.write(f.read(name))
                    py_name = Path(name).stem

        if bash_script:
            os.system(shlex.quote("apt update"))
            os.system(shlex.quote(bash_script))
        if reqs_script:
            os.system(
                shlex.quote(
                    "pip install -r {}/requirements.txt".format(tmp_folder)))

        _ = contexts.Context()
        _ = framework.import_files(volatility3.plugins, True)
        available_plugins = framework.list_plugins()

        for plugin in available_plugins:
            if plugin.startswith("custom.{}".format(py_name)):
                self.cleaned_data["name"] = plugin

        def install(bash_script, reqs_script, tmp_folder):
            if bash_script:
                os.system(shlex.quote("apt update"))
                os.system(shlex.quote(bash_script))
            if reqs_script:
                os.system(
                    shlex.quote("pip install -r {}/requirements.txt".format(
                        tmp_folder)))
                os.system(shlex.quote("rm -rf {}".format(tmp_folder)))

        dask_client = get_client(address="tcp://scheduler:8786")
        dask_client.run(install, bash_script, reqs_script, tmp_folder)
        plugin = super(PluginCreateAdminForm, self).save(commit=commit)

        for available_plugin in available_plugins:
            if available_plugin.startswith("custom.{}".format(py_name)):
                plugin.name = available_plugin
                plugin.save()

        return plugin
Example 6
def rechunk(original, split, final, split_chunks=None):
    """
    Rechunk a dataset.
    """
    a = da.from_zarr(original)
    chunks = {i: "auto" for i in range(a.ndim)}
    chunks[0] = -1
    if split_chunks is None:
        chunksize = a.rechunk(chunks).chunks[1][0]
        split_chunks = (chunksize, ) * (a.ndim - 1)

    client = get_client()
    fs = client.map(
        split_and_store,
        list(range(a.numblocks[0])),
        src=original,
        dst=split,
        split_chunks=split_chunks,
    )
    wait(fs)

    n = np.prod(da.from_zarr(split).numblocks[1:])
    fs = client.map(merge_and_store, range(n), src=split, dst=final)
    wait(fs)
    return da.from_zarr(final)
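The two client.map/wait rounds above fan the split and merge steps out across the cluster and block until each stage finishes. A minimal, self-contained sketch of that pattern (the doubling task here is purely illustrative):

from distributed import Client, wait

client = Client(processes=False)                  # local cluster for illustration
futures = client.map(lambda i: i * 2, range(8))   # one future per block index
wait(futures)                                     # block until every task is done
print(client.gather(futures))                     # [0, 2, 4, 6, 8, 10, 12, 14]
client.close()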
Example 7
    def __init__(self, name, worker=None, client=None):
        if worker is None and client is None:
            from distributed import get_worker, get_client

            try:
                worker = get_worker()
            except Exception:
                client = get_client()

        self.subscribers = dict()
        self.worker = worker
        self.client = client
        assert client or worker
        if self.worker:
            self.scheduler = self.worker.scheduler
            self.loop = self.worker.loop
        elif self.client:
            self.scheduler = self.client.scheduler
            self.loop = self.client.loop

        self.name = name
        self._started = False
        self._buffer = []

        self.loop.add_callback(self._start)

        if self.worker:
            pubsub = self.worker.extensions["pubsub"]
            self.loop.add_callback(pubsub.publishers[name].add, self)
            weakref.finalize(self, pubsub.trigger_cleanup)
Example 8
 def __init__(self,
              executable: str,
              configuration: dict,
              filemanager: str = None,
              num_workers: int = 1,
              threads_per_worker: int = OMP_NUM_THREADS,
              scheduler: str = None):
     """
     Create a new Ensemble object. The API mirrors that of the
     Simulation object.
     """
     self._status = 'Initialized'
     self.executable: str = executable
     self.filemanager: str = filemanager
     self.configuration: dict = configuration
     self.num_workers: int = num_workers
     self.simulations: dict = {}
     self.submissions: list = []
     # Try to get a client, and if none exists then start a new one
     try:
         self._client = get_client()
         # Start more workers if necessary:
         workers = len(self._client.get_worker_logs())
         if workers <= self.num_workers:
              self._client.cluster.scale(self.num_workers)
     except ValueError:
         self._client = Client(n_workers=self.num_workers,
                               threads_per_worker=threads_per_worker)
     self._generate_simulation_objects()
Example 9
    def __init__(self, delegate_config: DaskDelegateConfig):
        super().__init__()

        self.delegate_config = delegate_config
        self.cache_provider = self.delegate_config.cache_provider

        # Attempt to load the global Dask client.
        try:
            self.client = get_client()

        except ValueError as _:
            if self.delegate_config.kube_cluster is not None:
                self.client = Client(self.delegate_config.kube_cluster)
                print(self.delegate_config.kube_cluster)

            else:
                self.client = Client(f"{self.delegate_config.dask_cluster_address}:{self.delegate_config.dask_cluster_port}")

        # Setup functions to be run on the schedule.
        def __scheduler_job_exists(dask_scheduler, job_id: str) -> bool:
            return job_id in dask_scheduler.tasks

        def __scheduler_job_state(dask_scheduler, job_id: str) -> TaskState:
            return dask_scheduler.tasks[job_id].state

        self.scheduler_job_exists = __scheduler_job_exists
        self.scheduler_job_state = __scheduler_job_state
Example 10
    def __init__(self, name, worker=None, client=None):
        if worker is None and client is None:
            from distributed.worker import get_worker, get_client

            try:
                worker = get_worker()
            except Exception:
                client = get_client()

        self.worker = worker
        self.client = client
        if self.worker:
            self.loop = self.worker.loop
        elif self.client:
            self.loop = self.client.loop
        self.name = name
        self.buffer = deque()

        if self.worker:
            pubsub = self.worker.extensions["pubsub"]
        elif self.client:
            pubsub = self.client.extensions["pubsub"]
        self.loop.add_callback(pubsub.subscribers[name].add, self)

        msg = {"op": "pubsub-add-subscriber", "name": self.name}
        if self.worker:
            self.loop.add_callback(self.worker.batched_stream.send, msg)
        elif self.client:
            self.loop.add_callback(self.client.scheduler_comm.send, msg)
        else:
            raise Exception()

        weakref.finalize(self, pubsub.trigger_cleanup)
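Examples 7 and 10 are the internal constructors behind distributed's publish/subscribe machinery. A hedged sketch of how the public Pub/Sub API is typically driven from tasks (the topic name, payloads, and the short sleep are illustrative only):

from time import sleep
from distributed import Client, Pub, Sub

def producer():
    pub = Pub("my-topic")
    sleep(0.5)                  # crude: give the subscriber time to register
    for i in range(3):
        pub.put(i)              # broadcast to all subscribers of "my-topic"
    return "done"

def consumer():
    sub = Sub("my-topic")
    return [sub.get(timeout=5) for _ in range(3)]

client = Client(processes=False, n_workers=1, threads_per_worker=2)
c = client.submit(consumer)     # subscribe first so no message is missed
p = client.submit(producer)
print(client.gather([c, p]))    # typically [[0, 1, 2], 'done']
client.close()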
Example 11
    def __init__(self, scheduler_host=None, scatter=None,
                 client=None, loop=None, **submit_kwargs):
        if client is None:
            if scheduler_host:
                client = Client(scheduler_host, loop=loop,
                                set_as_default=False)
            else:
                try:
                    client = get_client()
                except ValueError:
                    msg = ("To use Joblib with Dask first create a Dask Client"
                           "\n\n"
                           "    from dask.distributed import Client\n"
                           "    client = Client()\n"
                           "or\n"
                           "    client = Client('scheduler-address:8786')")
                    raise ValueError(msg)

        self.client = client

        if scatter is not None and not isinstance(scatter, (list, tuple)):
            raise TypeError("scatter must be a list/tuple, got "
                            "`%s`" % type(scatter).__name__)

        if scatter is not None and len(scatter) > 0:
            # Keep a reference to the scattered data to keep the ids the same
            self._scatter = list(scatter)
            scattered = self.client.scatter(scatter, broadcast=True)
            self.data_futures = {id(x): f for x, f in zip(scatter, scattered)}
        else:
            self._scatter = []
            self.data_futures = {}
        self.task_futures = set()
        self.submit_kwargs = submit_kwargs
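This is the constructor of joblib's Dask backend; the get_client() call is what lets parallel_backend("dask") pick up an already running client. A minimal usage sketch (the squaring workload is illustrative):

from dask.distributed import Client
from joblib import Parallel, delayed, parallel_backend

client = Client(processes=False)   # becomes the default client that get_client() finds
with parallel_backend("dask"):
    results = Parallel()(delayed(pow)(i, 2) for i in range(5))
print(results)                     # [0, 1, 4, 9, 16]
client.close()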
Example 12
    def flatten_parallel(self, loops, interpolated_loop_coordinates, emission_model=None):
        """
        Compute intensity counts in parallel with Dask, save as a "flattened" column
        """
        # Setup scheduler
        client = distributed.get_client()
        start_indices = np.insert(np.array(
            [s.shape[0] for s in interpolated_loop_coordinates]).cumsum()[:-1], 0, 0)
        if emission_model is None:
            raise ValueError('Emission Model required')

        futures = {}
        for channel in self.channels:
            # Flatten emissivities for appropriate channel
            flat_emiss = client.scatter(self.flatten_emissivities(channel, emission_model))
            # Map functions to iterables
            futures[channel['name']] = []
            for i, loop in enumerate(loops):
                y = client.submit(self.calculate_counts, channel, loop, emission_model, flat_emiss, pure=False)
                interp_y = client.submit(self.interpolate, y, loop, pure=False)
                futures[channel['name']].append(
                    client.submit(self.write_to_hdf5, interp_y, start_indices[i], channel['name'], pure=False))

        return futures
Example 13
    def __init__(self, name, worker=None, client=None):
        if worker is None and client is None:
            from distributed import get_worker, get_client
            try:
                worker = get_worker()
            except Exception:
                client = get_client()

        self.subscribers = dict()
        self.worker = worker
        self.client = client
        assert client or worker
        if self.worker:
            self.scheduler = self.worker.scheduler
            self.loop = self.worker.loop
        elif self.client:
            self.scheduler = self.client.scheduler
            self.loop = self.client.loop

        self.name = name
        self._started = False
        self._buffer = []

        self.loop.add_callback(self._start)

        if self.worker:
            pubsub = self.worker.extensions['pubsub']
            self.loop.add_callback(pubsub.publishers[name].add, self)
            finalize(self, pubsub.trigger_cleanup)
Example 14
    def __init__(self, name, worker=None, client=None):
        if worker is None and client is None:
            from distributed.worker import get_worker, get_client
            try:
                worker = get_worker()
            except Exception:
                client = get_client()

        self.worker = worker
        self.client = client
        if self.worker:
            self.loop = self.worker.loop
        elif self.client:
            self.loop = self.client.loop
        self.name = name
        self.buffer = deque()
        self.condition = tornado.locks.Condition()

        if self.worker:
            pubsub = self.worker.extensions['pubsub']
        elif self.client:
            pubsub = self.client.extensions['pubsub']
        self.loop.add_callback(pubsub.subscribers[name].add, self)

        msg = {'op': 'pubsub-add-subscriber', 'name': self.name}
        if self.worker:
            self.loop.add_callback(self.worker.batched_stream.send, msg)
        elif self.client:
            self.loop.add_callback(self.client.scheduler_comm.send, msg)
        else:
            raise Exception()

        finalize(self, pubsub.trigger_cleanup)
Example 15
    def transform(self, raw_documents):
        params = self.get_params()
        vocabulary = params.pop("vocabulary")

        if vocabulary is None:
            check_is_fitted(self, "vocabulary_")
            vocabulary = self.vocabulary_

        if isinstance(vocabulary, dict):
            # scatter for the user
            try:
                client = get_client()
            except ValueError:
                vocabulary_for_transform = dask.delayed(vocabulary)
            else:
                (vocabulary_for_transform, ) = client.scatter((vocabulary, ),
                                                              broadcast=True)
        else:
            vocabulary_for_transform = vocabulary

        n_features = vocabulary_length(vocabulary_for_transform)
        transformed = raw_documents.map_partitions(_count_vectorizer_transform,
                                                   vocabulary_for_transform,
                                                   params)
        meta = scipy.sparse.eye(0, format="csr", dtype=self.dtype)
        return build_array(transformed, n_features, meta)
Example 16
    def _maybe_run(event_name: str, fn: Callable, *args: Any,
                   **kwargs: Any) -> Any:
        """Check if the task should run against a `distributed.Event` before
        starting the task. This offers stronger guarantees than distributed's
        current cancellation mechanism, which only cancels pending tasks."""
        import dask
        from distributed import Event, get_client

        try:
            # Explicitly pass in the timeout from dask's config. Some versions of
            # distributed hardcode this rather than using the value from the
            # config.  Can be removed once we bump our min requirements for
            # distributed to >= 2.31.0.
            timeout = dask.config.get("distributed.comm.timeouts.connect")
            event = Event(event_name, client=get_client(timeout=timeout))
            should_run = event.is_set()
        except Exception:
            # Failure to create an event is usually due to connection errors. These
            # are either due to flaky behavior in distributed's comms under high
            # loads, or due to the scheduler shutting down. Either way, the safest
            # course here is to assume we *should* run the task still. If we guess
            # wrong, we're either doing a bit of unnecessary work, or the cluster
            # is shutting down and the task will be cancelled anyway.
            should_run = True

        if should_run:
            return fn(*args, **kwargs)
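The guard above keys off a distributed.Event: while the event is set the wrapped task runs, and clearing it is how cancellation is signalled. A hedged sketch of the signalling side, with an illustrative event name:

from distributed import Client, Event

client = Client(processes=False)

event = Event("flow-run-123", client=client)  # illustrative event name
event.set()             # tasks guarded by _maybe_run would execute
print(event.is_set())   # True

event.clear()           # later _maybe_run checks would skip the work
print(event.is_set())   # False
client.close()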
Example 17
 def f(x):
     with Lock('x') as lock:
         client = get_client()
         assert client.get_metadata('locked') is False
         client.set_metadata('locked', True)
         sleep(0.05)
         assert client.get_metadata('locked') is True
         client.set_metadata('locked', False)
Example 18
 def f(x):
     with Lock('x') as lock:
         client = get_client()
         assert client.get_metadata('locked') == False
         client.set_metadata('locked', True)
         sleep(0.05)
         assert client.get_metadata('locked') == True
         client.set_metadata('locked', False)
Example 19
 def f(_):
     client = get_client()
     with MultiLock(names=["x"]):
         assert client.get_metadata("locked") is False
         client.set_metadata("locked", True)
         sleep(0.05)
         assert client.get_metadata("locked") is True
         client.set_metadata("locked", False)
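These near-identical bodies come from distributed's lock tests: each task takes a cluster-wide Lock or MultiLock, reads a shared metadata flag, flips it, sleeps, and flips it back, so any overlap between tasks trips an assertion. A hedged sketch of a driver that would exercise one of them:

from distributed import Client

client = Client(processes=False)
client.set_metadata("locked", False)   # the shared flag the tasks assert on
futures = client.map(f, range(10))     # many tasks contend for the same lock
client.gather(futures)                 # raises if mutual exclusion ever failed
client.close()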
Example 20
 def f(x):
     with Lock("x") as lock:
         client = get_client()
         assert client.get_metadata("locked") is False
         client.set_metadata("locked", True)
         sleep(0.05)
         assert client.get_metadata("locked") is True
         client.set_metadata("locked", False)
Example 21
    def __init__(self,
                 executable: str,
                 filemanager: str,
                 num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 chunk_size: int = None,
                 num_chunks: int = None,
                 scheduler: str = None,
                 client: Client = None):
        """
        Initialize a new distributed object

        Parameters
        ----------
        executable:
            Path to the SUMMA executable
        filemanager:
            Path to the file manager
        num_workers:
            Number of workers to use for parallel runs
        threads_per_worker:
            Number of threads each worker has
        chunk_size:
            Number of GRU per job
            (cannot be used with num_chunks)
        num_chunks:
            How many jobs to split the run into
            (Cannot be used with chunk_size)
        scheduler:
            Not used currently
        client:
            An existing dask Client to reuse; if not given, one is looked up
            with get_client() or created automatically
        """
        self._status = 'Initialized'
        self.executable = executable
        self.manager_path = Path(os.path.abspath(
            os.path.realpath(filemanager)))
        self.manager = FileManager(self.manager_path.parent,
                                   self.manager_path.name)
        self.simulations: Dict[str, Simulation] = {}
        self.submissions: List = []
        self.num_workers: int = num_workers
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            workers = len(self._client.get_worker_logs())
            if workers <= self.num_workers:
                self._client.cluster.scale(self.num_workers)
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                workers = len(self._client.get_worker_logs())
                if workers <= self.num_workers:
                    self._client.cluster.scale(self.num_workers)
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self.chunk_args = self._generate_args(chunk_size, num_chunks)
        self._generate_simulation_objects()
Example 22
    async def _compute_tile(self, x: int, y: int, z: int) -> bytes:
        "Send an XYZ tile to be computed by the distributed client, and wait for it."
        disp = self.disp
        client = distributed.get_client()
        # TODO assert the client's loop is the same as our current event loop.
        # If not... tell the server to shut down and restart on the new event loop?
        # (could also do this within a watch loop in `_launch_server`.)

        tile = geom_utils.xyztile_of_array(disp.arr,
                                           x,
                                           y,
                                           z,
                                           interpolation=disp.interpolation,
                                           tilesize=disp.tilesize)
        if tile is None:
            return empty_tile(disp.tilesize, disp.checkerboard)

        delayed_png = delayed_arr_to_png(
            tile.data,
            range=disp.range,
            cmap=disp.cmap,
            checkerboard=disp.checkerboard,
        )

        # TODO `compute` returns before the message has actually been sent,
        # which throws off our `stats.computing` metric. Would be nice to know
        # when the scheduler has actually received the message.
        future = client.compute(delayed_png, sync=False)
        future = cast(distributed.Future, future)
        self.stats.computing += 1

        awaitable = future if client.asynchronous else future._result()
        # ^ sneak into the async api if the client isn't set up to be async.
        # this _should_ be okay, since we're running within the client's own event loop.
        awaitable = cast(Awaitable[bytes], awaitable)
        try:
            result = await awaitable
            self.stats.completed += 1
            return result
        except Exception:
            # Typically an `asyncio.CancelledError` from the request being cancelled,
            # but no matter what it is, we want to ensure we drop the distributed Future.

            # There may be some state issues in `distributed.Client` around handling `CancelledError`s;
            # occasionally there are still references to them held within frames/tracebacks (this may also
            # have to do with aiohttp's error handling, ours, or IPython).
            # So we very aggressively try to cancel and release references to the future.
            try:
                await future.cancel(asynchronous=True)
            except asyncio.CancelledError:
                # Unlikely, but anytime we `await`, we could get cancelled.
                # We're already cleaning up, so ignore cancellation here.
                pass

            raise
        finally:
            future.release()
            self.stats.computing -= 1
Example 23
def create_experiment_list_qobj(factory, weights, theta_start, theta_end, theta_step, pm, device, qobj_id=None, use_dask=False, other_arguments=None):
    # type: (Callable[[List[float], float], QuantumCircuit], List[float], float, float, float, PassManager, BaseBackend, Optional[str], bool, Optional[dict]) -> List[Qobj]
    r = np.arange(theta_start, theta_end, theta_step)
    if use_dask:
        client = get_client()  # type: Client
        futures = [client.submit(retrieve_compiled_circuit, weights, theta, factory, pm, device, other_arguments) for theta in r]
        circuits = client.gather(futures)
    else:
        circuits = [retrieve_compiled_circuit(weights, theta, factory, pm, device, other_arguments) for theta in r]
    LOG.debug(len(r) * 5)
    if use_dask:
        client = get_client()  # type: Client
        qobjs = client.gather([client.submit(qiskit.compiler.assemble, c, backend_name=qasm_simulator().name(), shots=8192, max_credits=len(r) * 5, qobj_id=qobj_id)
                 for c in circuits])
    else:
        # noinspection PyTypeChecker
        qobjs = [qiskit.compiler.assemble(c, backend_name=qasm_simulator().name(), shots=8192, max_credits=len(r) * 5, qobj_id=qobj_id) for c in circuits]
    return qobjs
Example 24
def test_reader_context_manager(resources_dir):
    from aicsimageio.readers import CziReader

    # Ensure that no dask cluster or client is available before
    with pytest.raises(ValueError):
        get_client()

    # Load the image in a context manager that spawns and closes a cluster and client
    # Processes = False informs dask to use threads instead of processes
    # We must use threads here to make sure we can properly run codecov
    with CziReader(resources_dir / "s_3_t_1_c_3_z_5.czi",
                   dask_kwargs={"processes": False}) as reader:
        assert get_client() is not None
        assert reader.data.shape == (1, 3, 3, 5, 325, 475)

    # Ensure that no dask cluster or client is available after
    with pytest.raises(ValueError):
        get_client()
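The before/after assertions rely on get_client() raising ValueError whenever no running client is registered for the process. A hedged sketch of that contract on its own, outside the reader:

import pytest
from distributed import Client, get_client

with pytest.raises(ValueError):
    get_client()                     # no cluster or client exists yet

with Client(processes=False) as client:
    assert get_client() is client    # the context's client is now the default

with pytest.raises(ValueError):
    get_client()                     # a closed client is no longer returned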
Example 25
    def calculate_ionization_fraction(field, emission_model, **kwargs):
        """
        Solve the time-dependent ionization balance equation for all loops and all elements

        This method computes the time dependent ion population fractions for each element in 
        the emission model and each loop in the active region and compiles the results to a single
        HDF5 file. To do this efficiently, it uses the dask.distributed library to take advantage of
        multiple processes/cores/machines and compute the population fractions in parallel. It returns
        an asynchronous `~distributed.client.Future` object which holds the state of the submitted
        tasks.

        Parameters
        ----------
        field : `~synthesizAR.Field`
        emission_model : `~synthesizAR.atomic.EmissionModel`

        Other Parameters
        ----------------
        temperature : `~astropy.units.Quantity`

        Returns
        -------
        future : `~distributed.client.Future`
        """
        client = distributed.get_client()
        tmpdir = os.path.join(
            os.path.dirname(emission_model.ionization_fraction_savefile),
            'tmp_nei')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)
        unique_elements = list(
            set([ion.element_name for ion in emission_model]))
        temperature = kwargs.get('temperature', emission_model.temperature)

        el_futures = []
        for el_name in unique_elements:
            el = Element(el_name, temperature)
            rate_matrix = el._rate_matrix()
            ioneq = el.equilibrium_ionization(rate_matrix)
            partial_nei = toolz.curry(EbtelInterface.compute_and_save_nei)(
                el,
                rate_matrix=rate_matrix,
                initial_condition=ioneq,
                save_dir=tmpdir)
            loop_futures = client.map(partial_nei, field.loops)
            distributed.wait(loop_futures)
            el_futures += loop_futures

        store_future = client.submit(
            EbtelInterface.slice_and_store, el_futures,
            emission_model.ionization_fraction_savefile)
        future = client.submit(EbtelInterface._cleanup, store_future)

        return future
Example 26
def test_dont_select_closed_worker():
    # Make sure distributed does not try to reuse a client from a
    # closed cluster (https://github.com/dask/distributed/issues/2840).
    with clean(threads=False):
        cluster = LocalCluster(n_workers=0)
        c = Client(cluster)
        cluster.scale(2)
        assert c == get_client()

        c.close()
        cluster.close()

        cluster2 = LocalCluster(n_workers=0)
        c2 = Client(cluster2)
        cluster2.scale(2)

        current_client = get_client()
        assert c2 == current_client

        cluster2.close()
        c2.close()
Example 27
def execute(order: JobOrder) -> Result:
    t = tasks.find("test-task", "1.0")

    dsk = {
        'say-1': (t.execute, 'Hello 1 {}'.format(datetime.now())),
        'say-2': (t.execute, 'Hello 2 {}'.format(datetime.now())),
        'say-3': (t.execute, 'Hello 3 {}'.format(datetime.now())),
        'collect': ['say-1', 'say-2', 'say-3']
    }

    r = get_client().get(dsk, 'collect')
    f = write(r[0], r[1], r[2])
    return Result(exit_code=0, exit_status="OK", outputs=[f])
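get_client().get(dsk, 'collect') hands a hand-written dask graph straight to the scheduler: tuple values are tasks and list values collect the results of other keys. A minimal self-contained sketch of the same pattern (graph contents are illustrative):

from distributed import Client

client = Client(processes=False)
dsk = {
    "a": (sum, [1, 2, 3]),             # task: call sum([1, 2, 3])
    "b": (max, [4, 5]),                # task: call max([4, 5])
    "collect": ["a", "b"],             # key that gathers both results
}
print(client.get(dsk, "collect"))      # [6, 5]
client.close()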
Example 28
def extract_ddf_partitions(ddf):
    """ Returns the mapping: worker -> [list of futures]"""
    client = get_client()
    delayed_ddf = ddf.to_delayed()
    parts = client.compute(delayed_ddf)
    wait(parts)

    key_to_part = {str(part.key): part for part in parts}
    ret = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in client.who_has(parts).items():
        worker = first(
            workers
        )  # If multiple workers have the part, we pick the first worker
        ret[worker].append(key_to_part[key])
    return ret
Example 29
 def load_loop_simulations(self, interface, filename, **kwargs):
     """
     Load in loop parameters from hydrodynamic results.
     """
     root = zarr.open(store=filename, mode='w', **kwargs)
     for loop in self.loops:
         loop.model_results_filename = filename
     client = distributed.get_client()
     status = client.map(
         self._load_loop_simulation,
         self.loops,
         root=root,
         interface=interface,
     )
     return status
Example 30
    def dask_incref(cls, csr):
        def shared_csr_loader_incref(x):
            # This does nothing.  Exists only to trick scheduler into generating an event
            pass

        key = distributed.get_worker().get_current_task()
        client = distributed.get_client()

        for shm in [csr.pointers_shm, csr.indices_shm, csr.values_shm]:
            task_name = f"{cls.REFCOUNT_TAG}:{key}:{shm.name}"
            dummy_arg = key + shm.name
            client.submit(shared_csr_loader_incref,
                          dummy_arg,
                          key=task_name,
                          pure=False)
Example 31
def vocabulary_length(vocabulary):
    if isinstance(vocabulary, dict):
        return len(vocabulary)
    elif isinstance(vocabulary, Delayed):
        try:
            return len(vocabulary)
        except TypeError:
            return len(vocabulary.compute())
    elif isinstance(vocabulary, distributed.Future):
        client = get_client()
        future = client.submit(len, vocabulary)
        wait(future)
        result = future.result()
        return result
    else:
        raise ValueError(f"Unknown vocabulary type {type(vocabulary)}.")
Example 32
def _find_identifiers(physical_data_source: str, table_name: str,
                      columns_to_consider: List[str],
                      params: Configuration) -> Set[str]:
    _task_start = time.perf_counter()

    # Check if the scheduler is available
    if getattr(get_client(timeout=30), 'scheduler', None) is None:
        raise ServiceUnavailable(physical_data_source, table_name,
                                 columns_to_consider)

    # Limit the number of processed tables via a semaphore, this is mainly used to control the workers' memory usage
    # as the sampled tables are submitted to each worker's memory for faster processing (less ser/de overhead).
    # Keep in mind that a table with 8 columns and 10000 rows roughly takes up 150-250MB of memory.
    with Semaphore(max_leases=params.table_retrieval_limit,
                   name='table_retrieval_limit'):
        logger.info(
            f'Starting fetching and sampling {physical_data_source}.{table_name} with {columns_to_consider}'
        )

        table_sample = PhysicalDataSourceSampler(params).sample(
            physical_data_source, table_name, columns_to_consider)

        logger.info(
            f'Fetching and sampling done {physical_data_source}.{table_name} in {time.perf_counter() - _task_start:.2f}'
        )

        with Semaphore(max_leases=params.table_processing_limit,
                       name='table_processing_limit'):
            logger.info(
                f'Running the identifier parser on {physical_data_source}.{table_name}'
            )

            _task_start = time.perf_counter()

            identifiers = IdentifierParser(params).parse(table_sample)

            Measure.histogram(
                'idparser_runtime',
                tags={
                    'physical_data_source': physical_data_source,
                    'table_name': table_name,
                    'num_columns': len(columns_to_consider),
                    'num_ids': len(identifiers),
                },
            )(time.perf_counter() - _task_start)

            return identifiers
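The two nested Semaphore blocks throttle how many tables sit in worker memory and how many are parsed at once. A hedged, minimal sketch of distributed's Semaphore on its own (the lease count, name, and workload are illustrative):

from distributed import Client, Semaphore

client = Client(processes=False)
sem = Semaphore(max_leases=2, name="table_retrieval_limit")

def fetch(i):
    with sem:                # at most two tasks hold a lease at any time
        return i * 10

print(client.gather(client.map(fetch, range(5))))   # [0, 10, 20, 30, 40]
client.close()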
Example 33
    def __init__(self,
                 scheduler_host=None,
                 scatter=None,
                 client=None,
                 loop=None,
                 wait_for_workers_timeout=10,
                 **submit_kwargs):
        if distributed is None:
            msg = ("You are trying to use 'dask' as a joblib parallel backend "
                   "but dask is not installed. Please install dask "
                   "to fix this error.")
            raise ValueError(msg)

        if client is None:
            if scheduler_host:
                client = Client(scheduler_host,
                                loop=loop,
                                set_as_default=False)
            else:
                try:
                    client = get_client()
                except ValueError:
                    msg = ("To use Joblib with Dask first create a Dask Client"
                           "\n\n"
                           "    from dask.distributed import Client\n"
                           "    client = Client()\n"
                           "or\n"
                           "    client = Client('scheduler-address:8786')")
                    raise ValueError(msg)

        self.client = client

        if scatter is not None and not isinstance(scatter, (list, tuple)):
            raise TypeError("scatter must be a list/tuple, got "
                            "`%s`" % type(scatter).__name__)

        if scatter is not None and len(scatter) > 0:
            # Keep a reference to the scattered data to keep the ids the same
            self._scatter = list(scatter)
            scattered = self.client.scatter(scatter, broadcast=True)
            self.data_futures = {id(x): f for x, f in zip(scatter, scattered)}
        else:
            self._scatter = []
            self.data_futures = {}
        self.task_futures = set()
        self.wait_for_workers_timeout = wait_for_workers_timeout
        self.submit_kwargs = submit_kwargs
Example 34
def load_coo_to_csr(
    coo: dd.DataFrame,
    shape: Tuple[int, int],
    loader: CSRLoader,
    row="row",
    col="col",
    value="value",
    client=None,
):
    """Parallel conversion of COO graph in a Dask dataframe to a CSR graph.

    The Dask dataframe ``coo`` will be interpreted as a graph in COO format
    where the row, column, and edge value column names are given by ``row``,
    ``col``, and ``value``.  The dimensions of the final CSR sparse adjacency
    matrix are given by ``shape``.

    Creation and management of the target CSR graph object is handled by the
    ``loader`` class, which must be a subclass of ``CSRLoader``.  

    Note that the algorithm used by this function for parallel translation
    only makes sense for distributed CSR data structures that can be accessed
    directly by all Dask workers in the cluster.  A loader for a CSR matrix
    stored in POSIX shared memory is provided as an example
    (``SharedCSRLoader``) that runs on single system, multi-process Dask
    clusters.

    The return value from this function is a Dask future for a CSR object
    of the type created by ``loader``.
    """
    if client is None:
        client = distributed.get_client()
    loader.register_dask_scheduler_plugin(client)

    coo_desc = COODescriptor(shape, row, col, value)
    chunks = [
        extract_chunk_information(i, part, coo_desc)
        for i, part in enumerate(coo.partitions)
    ]
    plan = build_plan(coo_desc, chunks)
    empty_csr = allocate_csr(loader, plan)
    loaded_chunks = [
        load_chunk(loader, i, part, plan, empty_csr)
        for i, part in enumerate(coo.partitions)
    ]
    csr = finalize_csr(loader, empty_csr, loaded_chunks)

    return csr
Example 35
def use_distributed(c=None):
    """ Setup nbodykit to work with dask.distributed.
        This will change the default MPI communicator to MPI.COMM_SELF,
        such that each nbodykit object only resides on a single MPI rank.

        This function shall only be used before any nbodykit object is created.

        Parameters
        ----------
        c : Client
            the distributed client. If not given, the default client is used.
            Notice that if you switch to a new client then this function
            must be called again.

    """
    dask.config.set(scheduler="distributed")

    import distributed

    key = 'nbodykit_setup_for_distributed'
    if c is None:
        c = distributed.get_client()

    _setup_for_distributed()

    # use a lock to minimize the chances of seeing KeyError from publish_dataset;
    # the error is annoyingly printed to stderr even if we catch it.
    lock = distributed.Lock(key)
    locked = lock.acquire(timeout=3)

    if key not in c.list_datasets():
        try:
            c.publish_dataset(**{key : True})
            c.register_worker_callbacks(setup=_setup_for_distributed)
        except KeyError:
            # already published, someone else is registering the callback.
            pass

    if locked:
        lock.release()
Example 36
 def func(x):
     get_client()
     return
Example 37
 def f(x):
     cc = get_client()
     future = cc.submit(inc, x)
     return future.result()
Example 38
 def f():
     client = yield get_client()
     future = client.submit(inc, 10)
     result = yield future
     raise gen.Return(result)
Example 39
 def f():
     get_client().submit(inc, 1).result()
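Examples 36 through 39 all call get_client() from inside a running task in order to submit further work, the "tasks that launch tasks" pattern. A hedged sketch of driving a function like example 39's from a top-level client (the inc helper is assumed to be a simple increment):

from distributed import Client, get_client

def inc(x):                          # assumed helper: a simple increment
    return x + 1

def f():
    # Inside a task, get_client() returns the worker's client, so the task
    # can submit more tasks. For heavier use, distributed.worker_client()
    # or secede()/rejoin() avoid tying up worker threads while waiting.
    return get_client().submit(inc, 1).result()

client = Client(processes=False, n_workers=1, threads_per_worker=2)
print(client.submit(f).result())     # 2
client.close()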