def test_imread_use_dask_false(resources_dir):
    """Check that ``use_dask(False)`` makes ``AICSImage`` read without dask.

    The image is opened inside a thread-based cluster/client context with dask
    reads switched off. The optimized graph of ``img.dask_data`` is expected
    to hold 3 tasks, and the file must not be listed among this process's open
    files before or after the read.
    """
    with dask_utils.cluster_and_client(processes=False) as (cluster, client):
        image_path = resources_dir / BIG_OME_FILE
        current_process = Process()

        def file_is_open():
            # True when the image path shows up among the open file handles.
            open_paths = [handle.path for handle in current_process.open_files()]
            return str(image_path) in open_paths

        # Nothing has been read yet, so no file pointer may be open.
        assert not file_is_open()

        # Raises if no client exists.
        get_client()

        # Disable dask for reads and load the image.
        use_dask(False)
        img = AICSImage(image_path)
        assert img.data.shape == (3, 1, 3, 5, 325, 475)

        # The file was read with the base reader and then rechunked with dask.
        # Normally the task count for this file is 90.
        optimized_graph = optimize(img.dask_data)[0].__dask_graph__()
        assert len(optimized_graph) == 3

        # Still no open file pointer after the basic operations.
        assert not file_is_open()
def running_on_dask() -> bool:
    """Report whether a dask ``distributed`` client can be obtained.

    Returns:
        ``True`` when ``get_client`` can be imported from ``distributed`` and
        calling it succeeds; ``False`` when either step raises
        ``ImportError`` or ``ValueError``.
    """
    try:
        from distributed import get_client

        get_client()
    except (ImportError, ValueError):
        return False
    else:
        return True
def get_indices(cls, axis, partitions, index_func):
    """Collect the internal indices stored in the partitions.

    Note: These are the global indices of the object. This is mostly useful
        when rows/columns have been deleted internally and it is not known
        which ones were removed.

    Args:
        axis: The axis to extract the labels from (0 - index, 1 - columns).
        partitions: Two-dimensional collection of partition objects. For
            ``axis == 0`` the first column of blocks is used, otherwise the
            first row.
        index_func: Callable applied to each selected partition to extract
            its index.

    Returns:
        The first gathered index with the remaining ones appended to it, or
        the empty gathered result when there are no partitions.
    """
    client = get_client()
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    extractor = cls.preprocess_func(index_func)

    # Axis 0 walks the first column of blocks, axis 1 the first row.
    blocks = partitions.T if axis == 0 else partitions
    if len(blocks):
        pending = [block.apply(extractor).future for block in blocks[0]]
    else:
        pending = []

    gathered = client.gather(pending)
    if not len(gathered):
        return gathered
    head, tail = gathered[0], gathered[1:]
    return head.append(tail)
def create_experiment_qobj(factory, weights, theta_start, theta_end, theta_step, pm, device, qobj_id=None,
                           use_dask=False, other_arguments=None):
    # type: (Callable[[List[float], float, dict], QuantumCircuit], List[float], float, float, float, PassManager, BaseBackend, Optional[str], bool, Optional[dict]) -> Tuple[Qobj, List[float]]
    """Compile one circuit per theta value and assemble them into one Qobj.

    The theta values are ``np.arange(theta_start, theta_end, theta_step)``.
    Each circuit comes from ``retrieve_compiled_circuit``; with ``use_dask``
    the calls are submitted to the current dask client and gathered,
    otherwise they run in-process.

    Returns:
        A tuple of the assembled Qobj and the theta values as a plain list.
    """
    call_summary = {
        'factory': factory,
        'weights': weights,
        'theta_start': theta_start,
        'theta_end': theta_end,
        'theta_step': theta_step,
        'pm': str(pm),
        'device': str(device),
        'qobj_id': qobj_id,
        'use_dask': use_dask,
        'other_arguments': other_arguments
    }
    LOG.info("Creating Qobj with {}".format(call_summary))

    thetas = np.arange(theta_start, theta_end, theta_step)

    if not use_dask:
        circuits = []
        for theta in thetas:
            circuits.append(
                retrieve_compiled_circuit(weights, theta, factory, pm, device, other_arguments))
    else:
        client = get_client()  # type: Client
        pending = []
        for theta in thetas:
            pending.append(
                client.submit(retrieve_compiled_circuit, weights, theta, factory, pm, device, other_arguments))
        circuits = client.gather(pending)

    LOG.debug(len(thetas) * 5)

    # noinspection PyTypeChecker
    qobj = qiskit.compiler.assemble(
        circuits,
        backend_name=qasm_simulator().name(),
        shots=8192,
        max_credits=len(thetas) * 5,
        qobj_id=qobj_id,
    )
    theta_values = list(thetas.tolist())
    return qobj, theta_values
def save(self, commit=True):
    """Install an uploaded plugin archive and save the plugin record.

    The uploaded zip is scanned for three kinds of members: a ``.sh`` script
    (kept in memory), a top-level ``requirements.txt`` (copied into a fresh
    temporary folder below the plugin path) and ``.py`` files (copied into
    the plugin folder). The optional script and requirements are installed
    locally, the plugin list is refreshed, and the same installation is then
    run on every dask worker, which also removes the temporary folder.

    Args:
        commit: Forwarded to the parent form's ``save``.

    Returns:
        The saved plugin instance, named after the discovered
        ``custom.<module>`` plugin when one is found.
    """
    uploaded = self.cleaned_data["plugin"]
    bash_script = None
    reqs_script = False
    py_name = None

    plugin_folder = Path(settings.VOLATILITY_PLUGIN_PATH)
    tmp_folder = plugin_folder / str(uuid.uuid4())
    os.mkdir(tmp_folder)

    with zipfile.ZipFile(uploaded.file.path, "r") as archive:
        for name in archive.namelist():
            if name.endswith(".sh"):
                bash_script = archive.read(name)
            elif name.lower() == "requirements.txt":
                reqs_script = True
                with open(tmp_folder / "requirements.txt", "wb") as reqs:
                    reqs.write(archive.read(name))
            elif name.endswith(".py"):
                # Use only the member's base name so a crafted entry such as
                # "../../x.py" cannot be written outside the plugin folder.
                with open(plugin_folder / Path(name).name, "wb") as module:
                    module.write(archive.read(name))
                py_name = Path(name).stem

    def install(bash_script, reqs_script, tmp_folder, cleanup=True):
        # Self-contained (imports included) because it is also shipped to
        # the dask workers. Commands are passed as argument lists: quoting a
        # whole command line with shlex.quote turns it into a single word,
        # which the shell then fails to find as a program.
        import subprocess

        if bash_script:
            subprocess.run(["apt", "update"], check=False)
            # ZipFile.read returns bytes; the shell needs text.
            if isinstance(bash_script, bytes):
                script_text = bash_script.decode()
            else:
                script_text = bash_script
            # NOTE(review): this executes a script taken from the uploaded
            # archive; the form must stay restricted to trusted admins.
            subprocess.run(["sh", "-c", script_text], check=False)
        if reqs_script:
            subprocess.run(
                ["pip", "install", "-r",
                 "{}/requirements.txt".format(tmp_folder)],
                check=False,
            )
        if cleanup:
            subprocess.run(["rm", "-rf", str(tmp_folder)], check=False)

    # Local installation; the temporary folder is kept for the workers.
    install(bash_script, reqs_script, tmp_folder, cleanup=False)

    _ = contexts.Context()
    _ = framework.import_files(volatility3.plugins, True)
    available_plugins = framework.list_plugins()
    plugin_prefix = "custom.{}".format(py_name)
    for available_plugin in available_plugins:
        if available_plugin.startswith(plugin_prefix):
            self.cleaned_data["name"] = available_plugin

    # Repeat the installation on the workers, removing the folder afterwards.
    dask_client = get_client(address="tcp://scheduler:8786")
    dask_client.run(install, bash_script, reqs_script, tmp_folder)

    plugin = super(PluginCreateAdminForm, self).save(commit=commit)
    for available_plugin in available_plugins:
        if available_plugin.startswith(plugin_prefix):
            plugin.name = available_plugin
    plugin.save()
    return plugin
def rechunk(original, split, final, split_chunks=None):
    """Rechunk a zarr dataset in two passes through an intermediate store.

    Every block along axis 0 of ``original`` is passed to ``split_and_store``
    (writing to ``split``). The pieces are then combined by
    ``merge_and_store`` (writing to ``final``). Both passes run on the
    current dask client and are waited for.

    Args:
        original: Source store, readable by ``da.from_zarr``.
        split: Intermediate store.
        final: Destination store.
        split_chunks: Chunk sizes for the non-leading axes. When ``None``,
            the size dask picks automatically for axis 1 (with axis 0 kept
            whole) is used for every non-leading axis.

    Returns:
        The dask array read back from ``final``.
    """
    source = da.from_zarr(original)

    if split_chunks is None:
        auto_chunks = dict.fromkeys(range(source.ndim), "auto")
        auto_chunks[0] = -1
        chunksize = source.rechunk(auto_chunks).chunks[1][0]
        split_chunks = (chunksize,) * (source.ndim - 1)

    client = get_client()

    leading_blocks = list(range(source.numblocks[0]))
    split_jobs = client.map(
        split_and_store,
        leading_blocks,
        src=original,
        dst=split,
        split_chunks=split_chunks,
    )
    wait(split_jobs)

    n_merge = np.prod(da.from_zarr(split).numblocks[1:])
    merge_jobs = client.map(merge_and_store, range(n_merge), src=split, dst=final)
    wait(merge_jobs)

    return da.from_zarr(final)
def __init__(self, name, worker=None, client=None):
    """Create a publisher for topic ``name`` bound to a worker or a client.

    When neither ``worker`` nor ``client`` is given, the current worker is
    looked up first and the current client is used as a fallback. The
    scheduler handle and event loop are taken from whichever is in use, and
    ``self._start`` is scheduled on that loop. On a worker, the publisher
    also registers itself with the worker's ``"pubsub"`` extension and
    installs a finalizer that triggers the extension's cleanup.
    """
    if worker is None and client is None:
        from distributed import get_worker, get_client

        try:
            worker = get_worker()
        except Exception:
            client = get_client()

    self.subscribers = {}
    self.worker = worker
    self.client = client
    assert client or worker

    # Prefer the worker; otherwise fall back to the client.
    owner = self.worker if self.worker else self.client
    self.scheduler = owner.scheduler
    self.loop = owner.loop

    self.name = name
    self._started = False
    self._buffer = []

    self.loop.add_callback(self._start)

    if self.worker:
        pubsub = self.worker.extensions["pubsub"]
        self.loop.add_callback(pubsub.publishers[name].add, self)
        weakref.finalize(self, pubsub.trigger_cleanup)
def __init__(self, executable: str, configuration: dict, filemanager: str = None, num_workers: int = 1, threads_per_worker: int = OMP_NUM_THREADS, scheduler: str = None):
    """
    Create a new Ensemble object. The API mirrors that of the
    Simulation object.

    Parameters
    ----------
    executable:
        Stored as ``self.executable``.
    configuration:
        Stored as ``self.configuration``.
    filemanager:
        Stored as ``self.filemanager``.
    num_workers:
        Number of workers wanted. An existing client's cluster is scaled
        up to this number when it has fewer; a newly created client is
        started with this many workers.
    threads_per_worker:
        Threads per worker, used only when a new client is created.
    scheduler:
        Not used by this initializer.
    """
    self._status = 'Initialized'
    self.executable: str = executable
    self.filemanager: str = filemanager
    self.configuration: dict = configuration
    self.num_workers: int = num_workers
    self.simulations: dict = {}
    self.submissions: list = []

    # Try to get a client, and if none exists then start a new one
    try:
        self._client = get_client()
    except ValueError:
        self._client = Client(n_workers=self.num_workers,
                              threads_per_worker=threads_per_worker)
    else:
        # Start more workers if necessary. The target is the requested
        # worker count; scaling to the current count would change nothing.
        workers = len(self._client.get_worker_logs())
        cluster = getattr(self._client, 'cluster', None)
        # A client attached to an external scheduler has no cluster object
        # to scale.
        if cluster is not None and workers < self.num_workers:
            cluster.scale(self.num_workers)

    self._generate_simulation_objects()
def __init__(self, delegate_config: DaskDelegateConfig):
    """Set up the delegate and attach it to a Dask client.

    The global Dask client is used when one exists. Otherwise a new client
    is created from the configured kube cluster when that is set, or from
    the configured ``dask_cluster_address:dask_cluster_port``.

    Args:
        delegate_config: Supplies ``cache_provider``, ``kube_cluster``,
            ``dask_cluster_address`` and ``dask_cluster_port``.
    """
    # A bare ``super()`` only builds a proxy object; the parent initializer
    # has to be called explicitly.
    # NOTE(review): assumes the parent initializer takes no required
    # arguments - confirm against the base class.
    super().__init__()
    self.delegate_config = delegate_config
    self.cache_provider = self.delegate_config.cache_provider

    # Attempt to load the global Dask client.
    try:
        self.client = get_client()
    except ValueError:
        if self.delegate_config.kube_cluster is not None:
            self.client = Client(self.delegate_config.kube_cluster)
            print(self.delegate_config.kube_cluster)
        else:
            self.client = Client(f"{self.delegate_config.dask_cluster_address}:{self.delegate_config.dask_cluster_port}")

    # Setup functions to be run on the scheduler.
    def __scheduler_job_exists(dask_scheduler, job_id: str) -> bool:
        return job_id in dask_scheduler.tasks

    def __scheduler_job_state(dask_scheduler, job_id: str) -> TaskState:
        return dask_scheduler.tasks[job_id].state

    self.scheduler_job_exists = __scheduler_job_exists
    self.scheduler_job_state = __scheduler_job_state
def __init__(self, name, worker=None, client=None):
    """Create a subscriber for topic ``name`` bound to a worker or a client.

    When neither ``worker`` nor ``client`` is given, the current worker is
    looked up first and the current client is used as a fallback. The
    subscriber registers itself with the ``"pubsub"`` extension of whichever
    is in use, announces itself with a ``pubsub-add-subscriber`` message,
    and installs a finalizer that triggers the extension's cleanup.
    """
    if worker is None and client is None:
        from distributed.worker import get_worker, get_client

        try:
            worker = get_worker()
        except Exception:
            client = get_client()

    self.worker = worker
    self.client = client

    # Event loop and pubsub extension come from the same owner.
    if self.worker:
        self.loop = self.worker.loop
        pubsub = self.worker.extensions["pubsub"]
    elif self.client:
        self.loop = self.client.loop
        pubsub = self.client.extensions["pubsub"]

    self.name = name
    self.buffer = deque()

    self.loop.add_callback(pubsub.subscribers[name].add, self)

    announcement = {"op": "pubsub-add-subscriber", "name": self.name}
    if self.worker:
        send = self.worker.batched_stream.send
    elif self.client:
        send = self.client.scheduler_comm.send
    else:
        raise Exception()
    self.loop.add_callback(send, announcement)

    weakref.finalize(self, pubsub.trigger_cleanup)
def __init__(self, scheduler_host=None, scatter=None, client=None, loop=None, **submit_kwargs):
    """Bind the backend to a Dask client and pre-scatter shared data.

    Args:
        scheduler_host: Scheduler address. When given (and ``client`` is
            not), a new non-default ``Client`` is created for it.
        scatter: Optional list/tuple of objects to broadcast to the workers
            up front; their futures are remembered by object ``id``.
        client: Existing client to use. When neither this nor
            ``scheduler_host`` is given, the current client is looked up.
        loop: Passed to ``Client`` when one is created from
            ``scheduler_host``.
        **submit_kwargs: Stored as ``self.submit_kwargs``.

    Raises:
        ValueError: No client was given and none could be found.
        TypeError: ``scatter`` is neither ``None`` nor a list/tuple.
    """
    if client is None:
        if not scheduler_host:
            try:
                client = get_client()
            except ValueError:
                msg = ("To use Joblib with Dask first create a Dask Client"
                       "\n\n"
                       " from dask.distributed import Client\n"
                       " client = Client()\n"
                       "or\n"
                       " client = Client('scheduler-address:8786')")
                raise ValueError(msg)
        else:
            client = Client(scheduler_host, loop=loop, set_as_default=False)

    self.client = client

    if scatter is not None and not isinstance(scatter, (list, tuple)):
        raise TypeError("scatter must be a list/tuple, got `%s`"
                        % type(scatter).__name__)

    # Past the type check, ``scatter`` is None or a list/tuple, so plain
    # truthiness means "given and non-empty".
    if scatter:
        # Keep a reference to the scattered data to keep the ids the same
        self._scatter = list(scatter)
        scattered = self.client.scatter(scatter, broadcast=True)
        self.data_futures = dict(zip(map(id, scatter), scattered))
    else:
        self._scatter = []
        self.data_futures = {}

    self.task_futures = set()
    self.submit_kwargs = submit_kwargs
def flatten_parallel(self, loops, interpolated_loop_coordinates, emission_model=None):
    """
    Compute intensity counts in parallel with Dask, save as a "flattened" column

    For every channel the flattened emissivities are scattered to the
    cluster. For every loop, three chained tasks are then submitted:
    ``calculate_counts``, ``interpolate`` and ``write_to_hdf5``. The write
    task receives the start offset of the loop, which is the cumulative sum
    of the leading dimensions of the preceding interpolated coordinates.

    Returns a dict mapping each channel name to the list of its write
    futures. Raises ``ValueError`` when no emission model is given.
    """
    # Setup scheduler
    client = distributed.get_client()

    lengths = np.array([coords.shape[0] for coords in interpolated_loop_coordinates])
    start_indices = np.insert(lengths.cumsum()[:-1], 0, 0)

    if emission_model is None:
        raise ValueError('Emission Model required')

    futures = {}
    for channel in self.channels:
        # Flatten emissivities for appropriate channel
        flat_emiss = client.scatter(self.flatten_emissivities(channel, emission_model))

        # Map functions to iterables
        channel_name = channel['name']
        futures[channel_name] = []
        for i, loop in enumerate(loops):
            counts = client.submit(
                self.calculate_counts, channel, loop, emission_model, flat_emiss, pure=False)
            interpolated = client.submit(self.interpolate, counts, loop, pure=False)
            written = client.submit(
                self.write_to_hdf5, interpolated, start_indices[i], channel['name'], pure=False)
            futures[channel_name].append(written)

    return futures
def __init__(self, name, worker=None, client=None):
    """Create a publisher for topic ``name`` bound to a worker or a client.

    When neither ``worker`` nor ``client`` is given, the current worker is
    looked up first and the current client is used as a fallback. The
    scheduler handle and event loop are taken from whichever is in use, and
    ``self._start`` is scheduled on that loop. On a worker, the publisher
    also registers itself with the worker's ``'pubsub'`` extension and
    installs a finalizer that triggers the extension's cleanup.
    """
    if worker is None and client is None:
        from distributed import get_worker, get_client

        try:
            worker = get_worker()
        except Exception:
            client = get_client()

    self.subscribers = {}
    self.worker = worker
    self.client = client
    assert client or worker

    # Prefer the worker; otherwise fall back to the client.
    owner = self.worker if self.worker else self.client
    self.scheduler = owner.scheduler
    self.loop = owner.loop

    self.name = name
    self._started = False
    self._buffer = []

    self.loop.add_callback(self._start)

    if self.worker:
        pubsub = self.worker.extensions['pubsub']
        self.loop.add_callback(pubsub.publishers[name].add, self)
        finalize(self, pubsub.trigger_cleanup)
def __init__(self, name, worker=None, client=None):
    """Create a subscriber for topic ``name`` bound to a worker or a client.

    When neither ``worker`` nor ``client`` is given, the current worker is
    looked up first and the current client is used as a fallback. The
    subscriber keeps a message buffer and a tornado ``Condition``. It
    registers itself with the ``'pubsub'`` extension of whichever is in use,
    announces itself with a ``pubsub-add-subscriber`` message, and installs
    a finalizer that triggers the extension's cleanup.
    """
    if worker is None and client is None:
        from distributed.worker import get_worker, get_client

        try:
            worker = get_worker()
        except Exception:
            client = get_client()

    self.worker = worker
    self.client = client

    # Event loop and pubsub extension come from the same owner.
    if self.worker:
        self.loop = self.worker.loop
        pubsub = self.worker.extensions['pubsub']
    elif self.client:
        self.loop = self.client.loop
        pubsub = self.client.extensions['pubsub']

    self.name = name
    self.buffer = deque()
    self.condition = tornado.locks.Condition()

    self.loop.add_callback(pubsub.subscribers[name].add, self)

    announcement = {'op': 'pubsub-add-subscriber', 'name': self.name}
    if self.worker:
        send = self.worker.batched_stream.send
    elif self.client:
        send = self.client.scheduler_comm.send
    else:
        raise Exception()
    self.loop.add_callback(send, announcement)

    finalize(self, pubsub.trigger_cleanup)
def transform(self, raw_documents):
    """Vectorize ``raw_documents`` partition by partition.

    The vocabulary comes from the estimator parameters, falling back to the
    fitted ``vocabulary_``. A ``dict`` vocabulary is shared with the workers
    on the user's behalf: broadcast through the current client when there is
    one, otherwise wrapped with ``dask.delayed``. Any other vocabulary
    object is passed through unchanged.

    Returns:
        The result of ``build_array`` over the transformed partitions, the
        vocabulary length and an empty CSR meta matrix of ``self.dtype``.
    """
    params = self.get_params()
    vocabulary = params.pop("vocabulary")

    if vocabulary is None:
        check_is_fitted(self, "vocabulary_")
        vocabulary = self.vocabulary_

    if not isinstance(vocabulary, dict):
        vocabulary_for_transform = vocabulary
    else:
        # scatter for the user
        try:
            client = get_client()
        except ValueError:
            vocabulary_for_transform = dask.delayed(vocabulary)
        else:
            [vocabulary_for_transform] = client.scatter((vocabulary,), broadcast=True)

    n_features = vocabulary_length(vocabulary_for_transform)
    transformed = raw_documents.map_partitions(
        _count_vectorizer_transform, vocabulary_for_transform, params
    )
    meta = scipy.sparse.eye(0, format="csr", dtype=self.dtype)
    return build_array(transformed, n_features, meta)
def _maybe_run(event_name: str, fn: Callable, *args: Any, **kwargs: Any) -> Any:
    """Run ``fn`` only if the ``distributed.Event`` named ``event_name`` is set.

    This offers stronger guarantees than distributed's current cancellation
    mechanism, which only cancels pending tasks.

    Returns:
        ``fn(*args, **kwargs)`` when the event is set or could not be
        checked; ``None`` when the event is not set.
    """
    import dask
    from distributed import Event, get_client

    try:
        # Explicitly pass in the timeout from dask's config_dict. Some
        # versions of distributed hardcode this rather than using the value
        # from the config_dict. Can be removed once we bump our min
        # requirements for distributed to >= 2.31.0.
        connect_timeout = dask.config.get("distributed.comm.timeouts.connect")
        gate = Event(event_name, client=get_client(timeout=connect_timeout))
        should_run = gate.is_set()
    except Exception:
        # Failure to create an event is usually due to connection errors.
        # These are either due to flaky behavior in distributed's comms
        # under high loads, or due to the scheduler shutting down. Either
        # way, the safest course here is to assume we *should* run the task
        # still. If we guess wrong, we're either doing a bit of unnecessary
        # work, or the cluster is shutting down and the task will be
        # cancelled anyway.
        should_run = True

    if not should_run:
        return None
    return fn(*args, **kwargs)
def f(x):
    """Flip the ``'locked'`` metadata flag while holding ``Lock('x')``.

    The flag must read ``False`` on entry, is set to ``True`` for 0.05
    seconds, must still read ``True`` afterwards, and is then reset. ``x``
    is ignored.
    """
    flag = 'locked'
    with Lock('x'):
        client = get_client()
        # Nobody else may hold the flag while we own the lock.
        assert client.get_metadata(flag) is False
        client.set_metadata(flag, True)
        sleep(0.05)
        assert client.get_metadata(flag) is True
        client.set_metadata(flag, False)
def f(x):
    """Flip the ``'locked'`` metadata flag while holding ``Lock('x')``.

    The flag must read ``False`` on entry, is set to ``True`` for 0.05
    seconds, must still read ``True`` afterwards, and is then reset. ``x``
    is ignored.
    """
    with Lock('x'):
        client = get_client()
        # Identity checks against the bool singletons, so a merely falsy or
        # truthy value (0, 1, None) cannot satisfy the assertion.
        assert client.get_metadata('locked') is False
        client.set_metadata('locked', True)
        sleep(0.05)
        assert client.get_metadata('locked') is True
        client.set_metadata('locked', False)
def f(_):
    """Flip the ``"locked"`` metadata flag while holding ``MultiLock(["x"])``.

    The flag must read ``False`` on entry, is set to ``True`` for 0.05
    seconds, must still read ``True`` afterwards, and is then reset. The
    argument is ignored.
    """
    flag = "locked"
    client = get_client()
    with MultiLock(names=["x"]):
        # Nobody else may hold the flag while we own the lock.
        assert client.get_metadata(flag) is False
        client.set_metadata(flag, True)
        sleep(0.05)
        assert client.get_metadata(flag) is True
        client.set_metadata(flag, False)
def f(x):
    """Flip the ``"locked"`` metadata flag while holding ``Lock("x")``.

    The flag must read ``False`` on entry, is set to ``True`` for 0.05
    seconds, must still read ``True`` afterwards, and is then reset. ``x``
    is ignored.
    """
    flag = "locked"
    with Lock("x"):
        client = get_client()
        # Nobody else may hold the flag while we own the lock.
        assert client.get_metadata(flag) is False
        client.set_metadata(flag, True)
        sleep(0.05)
        assert client.get_metadata(flag) is True
        client.set_metadata(flag, False)
def __init__(self, executable: str, filemanager: str, num_workers: int = 1, threads_per_worker: int = OMP_NUM_THREADS, chunk_size: int = None, num_chunks: int = None, scheduler: str = None, client: Client = None):
    """
    Initialize a new distributed object

    Parameters
    ----------
    executable:
        Path to the SUMMA executable
    filemanager:
        Path to the file manager
    num_workers:
        Number of workers to use for parallel runs
    threads_per_worker:
        Number of threads each worker has
    chunk_size:
        Number of GRU per job (cannot be used with num_chunks)
    num_chunks:
        How many jobs to split the run into
        (Cannot be used with chunk_size)
    scheduler:
        Not used currently
    client:
        Existing dask client to use. When omitted, the current client is
        looked up and a new local one is started if none exists.
    """
    self._status = 'Initialized'
    self.executable = executable
    self.manager_path = Path(os.path.abspath(
        os.path.realpath(filemanager)))
    self.manager = FileManager(self.manager_path.parent,
                               self.manager_path.name)
    self.simulations: Dict[str, Simulation] = {}
    self.submissions: List = []
    self.num_workers: int = num_workers

    # Use the given client, else the current one, else start a new one.
    reused_client = True
    if client is not None:
        self._client = client
    else:
        try:
            self._client = get_client()
        except ValueError:
            self._client = Client(n_workers=self.num_workers,
                                  threads_per_worker=threads_per_worker)
            # A fresh client already has the requested worker count.
            reused_client = False

    if reused_client:
        # Start more workers if necessary. The target is the requested
        # worker count; scaling to the current count would change nothing.
        workers = len(self._client.get_worker_logs())
        cluster = getattr(self._client, 'cluster', None)
        # A client attached to an external scheduler has no cluster object
        # to scale.
        if cluster is not None and workers < self.num_workers:
            cluster.scale(self.num_workers)

    self.chunk_args = self._generate_args(chunk_size, num_chunks)
    self._generate_simulation_objects()
async def _compute_tile(self, x: int, y: int, z: int) -> bytes:
    """
    Send an XYZ tile to be computed by the distributed client, and wait for it.

    Returns the result of ``empty_tile`` without submitting anything when
    ``xyztile_of_array`` yields no tile. Otherwise the tile is submitted
    through ``delayed_arr_to_png`` and its result is awaited.
    ``self.stats.computing`` is incremented on submission and decremented
    when the wait ends, however it ends. ``self.stats.completed`` is
    incremented on success. Any exception caught while waiting is re-raised
    after an attempt to cancel the future.
    """
    disp = self.disp
    client = distributed.get_client()
    # TODO assert the client's loop is the same as our current event loop.
    # If not... tell the server to shut down and restart on the new event loop?
    # (could also do this within a watch loop in `_launch_server`.)

    tile = geom_utils.xyztile_of_array(
        disp.arr, x, y, z, interpolation=disp.interpolation, tilesize=disp.tilesize
    )
    if tile is None:
        # Nothing to render for this tile.
        return empty_tile(disp.tilesize, disp.checkerboard)

    delayed_png = delayed_arr_to_png(
        tile.data,
        range=disp.range,
        cmap=disp.cmap,
        checkerboard=disp.checkerboard,
    )

    # TODO `compute` returns before the message has actually been sent,
    # which throws off our `stats.computing` metric. Would be nice to know
    # when the scheduler has actually received the message.
    future = client.compute(delayed_png, sync=False)
    future = cast(distributed.Future, future)
    self.stats.computing += 1

    awaitable = future if client.asynchronous else future._result()
    # ^ sneak into the async api if the client isn't set up to be async.
    # this _should_ be okay, since we're running within the client's own event loop.
    awaitable = cast(Awaitable[bytes], awaitable)

    try:
        result = await awaitable
        self.stats.completed += 1
        return result
    except Exception:
        # Typically an `asyncio.CancelledError` from the request being cancelled,
        # but no matter what it is, we want to ensure we drop the distributed Future.
        # There may be some state issues in `distributed.Client` around handling `CancelledError`s;
        # occasionally there are still references to them held within frames/tracebacks (this may also
        # have to do with aiohttp's error handling, or ours, or ipython).
        # So we very aggressively try to cancel and release references to the future.
        # NOTE(review): since Python 3.8 `asyncio.CancelledError` derives from
        # `BaseException`, so this `except Exception` would not see it; in that
        # case only the `finally` below runs - confirm that is sufficient.
        try:
            await future.cancel(asynchronous=True)
        except asyncio.CancelledError:
            # Unlikely, but anytime we `await`, we could get cancelled.
            # We're already cleaning up, so ignore cancellation here.
            pass
        raise
    finally:
        # Runs on success, failure and cancellation alike.
        future.release()
        self.stats.computing -= 1
def create_experiment_list_qobj(factory, weights, theta_start, theta_end, theta_step, pm, device, qobj_id=None,
                                use_dask=False, other_arguments=None):
    # type: (Callable[[List[float], float], QuantumCircuit], List[float], float, float, float, PassManager, BaseBackend, Optional[str], bool, Optional[dict]) -> List[Qobj]
    """Compile one circuit per theta value and assemble each into its own Qobj.

    The theta values are ``np.arange(theta_start, theta_end, theta_step)``.
    With ``use_dask`` both the compilation and the assembly are submitted to
    the current dask client and gathered; otherwise they run in-process.

    Returns:
        One assembled Qobj per theta value, in theta order.
    """
    thetas = np.arange(theta_start, theta_end, theta_step)
    max_credits = len(thetas) * 5

    if use_dask:
        client = get_client()  # type: Client
        # ``weights`` must lead the argument list exactly as in the serial
        # branch; leaving it out shifts every following argument by one.
        futures = [client.submit(retrieve_compiled_circuit, weights, theta, factory, pm, device, other_arguments)
                   for theta in thetas]
        circuits = client.gather(futures)
    else:
        circuits = [retrieve_compiled_circuit(weights, theta, factory, pm, device, other_arguments)
                    for theta in thetas]

    LOG.debug(max_credits)

    if use_dask:
        qobjs = client.gather([
            client.submit(qiskit.compiler.assemble, c, backend_name=qasm_simulator().name(),
                          shots=8192, max_credits=max_credits, qobj_id=qobj_id)
            for c in circuits
        ])
    else:
        # noinspection PyTypeChecker
        qobjs = [
            qiskit.compiler.assemble(c, backend_name=qasm_simulator().name(),
                                     shots=8192, max_credits=max_credits, qobj_id=qobj_id)
            for c in circuits
        ]

    return qobjs
def test_reader_context_manager(resources_dir):
    """Check that ``CziReader`` as a context manager owns its dask client.

    ``get_client()`` must raise ``ValueError`` before the block is entered
    and again after it is left, while a client must be available inside.
    """
    from aicsimageio.readers import CziReader

    def assert_no_client():
        with pytest.raises(ValueError):
            get_client()

    # Ensure that no dask cluster or client is available before
    assert_no_client()

    # Load the image in a context manager that spawns and closes a cluster
    # and client. Processes = False informs dask to use threads instead of
    # processes. We must use threads here to make sure we can properly run
    # codecov.
    czi_path = resources_dir / "s_3_t_1_c_3_z_5.czi"
    thread_kwargs = {"processes": False}
    with CziReader(czi_path, dask_kwargs=thread_kwargs) as reader:
        assert get_client() is not None
        assert reader.data.shape == (1, 3, 3, 5, 325, 475)

    # Ensure that no dask cluster or client is available after
    assert_no_client()
def calculate_ionization_fraction(field, emission_model, **kwargs):
    """
    Solve the time-dependent ionization balance equation for all loops and all elements

    For each distinct element name in the emission model, the rate matrix
    and the equilibrium ionization are computed locally. A curried
    ``EbtelInterface.compute_and_save_nei`` is then mapped over
    ``field.loops`` on the current dask client and waited for. Intermediate
    results go into a ``tmp_nei`` directory next to the emission model's
    ionization fraction savefile. Finally a ``slice_and_store`` task and a
    ``_cleanup`` task that depends on it are submitted.

    Parameters
    ----------
    field : `~synthesizAR.Field`
    emission_model : `~synthesizAR.atomic.EmissionModel`

    Other Parameters
    ---------------------
    temperature : `~astropy.units.Quantity`
        Defaults to ``emission_model.temperature``.

    Returns
    --------
    future : `~distributed.client.Future`
        Future of the cleanup task.
    """
    client = distributed.get_client()

    savefile_dir = os.path.dirname(emission_model.ionization_fraction_savefile)
    tmpdir = os.path.join(savefile_dir, 'tmp_nei')
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    unique_elements = list({ion.element_name for ion in emission_model})
    temperature = kwargs.get('temperature', emission_model.temperature)

    el_futures = []
    for el_name in unique_elements:
        element = Element(el_name, temperature)
        rate_matrix = element._rate_matrix()
        ioneq = element.equilibrium_ionization(rate_matrix)
        partial_nei = toolz.curry(EbtelInterface.compute_and_save_nei)(
            element,
            rate_matrix=rate_matrix,
            initial_condition=ioneq,
            save_dir=tmpdir,
        )
        loop_futures = client.map(partial_nei, field.loops)
        # Block until this element is done before starting the next one.
        distributed.wait(loop_futures)
        el_futures.extend(loop_futures)

    store_future = client.submit(
        EbtelInterface.slice_and_store,
        el_futures,
        emission_model.ionization_fraction_savefile,
    )
    return client.submit(EbtelInterface._cleanup, store_future)
def test_dont_select_closed_worker():
    """Check that ``get_client()`` does not hand back a closed client.

    Make sure distributed does not try to reuse a client from a closed
    cluster (https://github.com/dask/distributed/issues/2840).
    """
    with clean(threads=False):
        # First cluster/client pair: created, checked, then shut down.
        first_cluster = LocalCluster(n_workers=0)
        first_client = Client(first_cluster)
        first_cluster.scale(2)
        assert first_client == get_client()

        first_client.close()
        first_cluster.close()

        # Second pair: the current client must now be the new one.
        second_cluster = LocalCluster(n_workers=0)
        second_client = Client(second_cluster)
        second_cluster.scale(2)

        current_client = get_client()
        assert second_client == current_client

        second_cluster.close()
        second_client.close()
def execute(order: JobOrder) -> Result:
    """Run three ``test-task`` greetings on dask and write out the results.

    Builds a graph with three ``say-N`` tasks, each calling the task's
    ``execute`` with a timestamped greeting, plus a ``collect`` key listing
    them. The collected values are passed to ``write`` and its return value
    becomes the single output of an ``OK`` result. ``order`` is not used.
    """
    task = tasks.find("test-task", "1.0")

    graph = {}
    for number in (1, 2, 3):
        greeting = 'Hello {} {}'.format(number, datetime.now())
        graph['say-{}'.format(number)] = (task.execute, greeting)
    graph['collect'] = ['say-1', 'say-2', 'say-3']

    collected = get_client().get(graph, 'collect')
    first_reply, second_reply, third_reply = collected[0], collected[1], collected[2]
    output = write(first_reply, second_reply, third_reply)
    return Result(exit_code=0, exit_status="OK", outputs=[output])
def extract_ddf_partitions(ddf):
    """Return the mapping: worker -> [list of futures].

    The partitions of ``ddf`` are computed on the current client and waited
    for. Each resulting future is then filed under the first worker that
    ``who_has`` reports as holding it.
    """
    client = get_client()

    parts = client.compute(ddf.to_delayed())
    wait(parts)

    part_by_key = {str(part.key): part for part in parts}

    # Map worker -> [list of futures]
    worker_to_parts = defaultdict(list)
    for key, holders in client.who_has(parts).items():
        # If multiple workers have the part, we pick the first worker
        chosen = first(holders)
        worker_to_parts[chosen].append(part_by_key[key])
    return worker_to_parts
def load_loop_simulations(self, interface, filename, **kwargs):
    """
    Load in loop parameters from hydrodynamic results.

    Opens ``filename`` as a zarr store in write mode (extra keyword
    arguments go to ``zarr.open``), records the filename on every loop as
    ``model_results_filename``, and maps ``self._load_loop_simulation`` over
    the loops on the current dask client.

    Returns whatever ``client.map`` returns for the submitted loads.
    """
    root = zarr.open(store=filename, mode='w', **kwargs)

    for current_loop in self.loops:
        current_loop.model_results_filename = filename

    client = distributed.get_client()
    return client.map(
        self._load_loop_simulation,
        self.loops,
        root=root,
        interface=interface,
    )
def dask_incref(cls, csr):
    """Submit one no-op task per shared-memory segment of ``csr``.

    Each task is keyed ``"<REFCOUNT_TAG>:<current task key>:<segment name>"``
    for the pointers, indices and values segments. The submitted function
    does nothing; it exists only so the scheduler generates an event.
    """
    def shared_csr_loader_incref(x):
        # This does nothing. Exists only to trick scheduler into generating an event
        pass

    current_key = distributed.get_worker().get_current_task()
    client = distributed.get_client()

    segments = (csr.pointers_shm, csr.indices_shm, csr.values_shm)
    for segment in segments:
        client.submit(
            shared_csr_loader_incref,
            current_key + segment.name,
            key=f"{cls.REFCOUNT_TAG}:{current_key}:{segment.name}",
            pure=False,
        )
def vocabulary_length(vocabulary):
    """Return the number of entries in ``vocabulary``.

    Supports a plain ``dict``, a dask ``Delayed`` (computed only when
    ``len`` is not supported directly) and a ``distributed.Future`` (the
    length is computed on the cluster).

    Raises:
        ValueError: ``vocabulary`` is of any other type.
    """
    if isinstance(vocabulary, dict):
        return len(vocabulary)

    if isinstance(vocabulary, Delayed):
        try:
            return len(vocabulary)
        except TypeError:
            return len(vocabulary.compute())

    if isinstance(vocabulary, distributed.Future):
        client = get_client()
        length_future = client.submit(len, vocabulary)
        wait(length_future)
        return length_future.result()

    raise ValueError(f"Unknown vocabulary type {type(vocabulary)}.")
def _find_identifiers(physical_data_source: str, table_name: str, columns_to_consider: List[str], params: Configuration) -> Set[str]: _task_start = time.perf_counter() # Check if the scheduler is available if getattr(get_client(timeout=30), 'scheduler', None) is None: raise ServiceUnavailable(physical_data_source, table_name, columns_to_consider) # Limit the number of processed tables via a semaphore, this is mainly used to control the workers' memory usage # as the sampled tables are submitted to each worker's memory for faster processing (less ser/de overhead). # Keep in mind that a table with 8 columns and 10000 rows roughly takes up 150-250MB of memory. with Semaphore(max_leases=params.table_retrieval_limit, name='table_retrieval_limit'): logger.info( f'Starting fetching and sampling {physical_data_source}.{table_name} with {columns_to_consider}' ) table_sample = PhysicalDataSourceSampler(params).sample( physical_data_source, table_name, columns_to_consider) logger.info( f'Fetching and sampling done {physical_data_source}.{table_name} in {time.perf_counter() - _task_start:.2f}' ) with Semaphore(max_leases=params.table_processing_limit, name='table_processing_limit'): logger.info( f'Running the identifier parser on {physical_data_source}.{table_name}' ) _task_start = time.perf_counter() identifiers = IdentifierParser(params).parse(table_sample) Measure.histogram( 'idparser_runtime', tags={ 'physical_data_source': physical_data_source, 'table_name': table_name, 'num_columns': len(columns_to_consider), 'num_ids': len(identifiers), }, )(time.perf_counter() - _task_start) return identifiers
def __init__(self, scheduler_host=None, scatter=None, client=None, loop=None, wait_for_workers_timeout=10, **submit_kwargs):
    """Bind the backend to a Dask client and pre-scatter shared data.

    Args:
        scheduler_host: Scheduler address. When given (and ``client`` is
            not), a new non-default ``Client`` is created for it.
        scatter: Optional list/tuple of objects to broadcast to the workers
            up front; their futures are remembered by object ``id``.
        client: Existing client to use. When neither this nor
            ``scheduler_host`` is given, the current client is looked up.
        loop: Passed to ``Client`` when one is created from
            ``scheduler_host``.
        wait_for_workers_timeout: Stored as
            ``self.wait_for_workers_timeout``.
        **submit_kwargs: Stored as ``self.submit_kwargs``.

    Raises:
        ValueError: ``distributed`` is unavailable, or no client was given
            and none could be found.
        TypeError: ``scatter`` is neither ``None`` nor a list/tuple.
    """
    if distributed is None:
        msg = ("You are trying to use 'dask' as a joblib parallel backend "
               "but dask is not installed. Please install dask "
               "to fix this error.")
        raise ValueError(msg)

    if client is None:
        if not scheduler_host:
            try:
                client = get_client()
            except ValueError:
                msg = ("To use Joblib with Dask first create a Dask Client"
                       "\n\n"
                       " from dask.distributed import Client\n"
                       " client = Client()\n"
                       "or\n"
                       " client = Client('scheduler-address:8786')")
                raise ValueError(msg)
        else:
            client = Client(scheduler_host, loop=loop, set_as_default=False)

    self.client = client

    if scatter is not None and not isinstance(scatter, (list, tuple)):
        raise TypeError("scatter must be a list/tuple, got `%s`"
                        % type(scatter).__name__)

    # Past the type check, ``scatter`` is None or a list/tuple, so plain
    # truthiness means "given and non-empty".
    if scatter:
        # Keep a reference to the scattered data to keep the ids the same
        self._scatter = list(scatter)
        scattered = self.client.scatter(scatter, broadcast=True)
        self.data_futures = dict(zip(map(id, scatter), scattered))
    else:
        self._scatter = []
        self.data_futures = {}

    self.task_futures = set()
    self.wait_for_workers_timeout = wait_for_workers_timeout
    self.submit_kwargs = submit_kwargs
def load_coo_to_csr(
    coo: dd.DataFrame,
    shape: Tuple[int, int],
    loader: CSRLoader,
    row="row",
    col="col",
    value="value",
    client=None,
):
    """Parallel conversion of COO graph in a Dask dataframe to a CSR graph.

    The Dask dataframe ``coo`` will be interpreted as a graph in COO format
    where the row, column, and edge value column names are given by ``row``,
    ``col``, and ``value``. The dimensions of the final CSR sparse adjacency
    matrix are given by ``shape``.

    Creation and management of the target CSR graph object is handled by the
    ``loader`` class, which must be a subclass of ``CSRLoader``. Note that
    the algorithm used by this function for parallel translation only makes
    sense for distributed CSR data structures that can be accessed directly
    by all Dask workers in the cluster. A loader for a CSR matrix stored in
    POSIX shared memory is provided as an example (``SharedCSRLoader``) that
    runs on single system, multi-process Dask clusters.

    When ``client`` is omitted the current distributed client is used. The
    loader's scheduler plugin is registered on the client before any work is
    planned.

    The return value from this function is a Dask future for a CSR object of
    the type created by ``loader``.
    """
    if client is None:
        client = distributed.get_client()
    loader.register_dask_scheduler_plugin(client)

    coo_desc = COODescriptor(shape, row, col, value)

    # First pass over the partitions: gather per-chunk information.
    chunk_infos = []
    for index, partition in enumerate(coo.partitions):
        chunk_infos.append(extract_chunk_information(index, partition, coo_desc))

    plan = build_plan(coo_desc, chunk_infos)
    empty_csr = allocate_csr(loader, plan)

    # Second pass: load every partition into the allocated CSR.
    loaded_chunks = []
    for index, partition in enumerate(coo.partitions):
        loaded_chunks.append(load_chunk(loader, index, partition, plan, empty_csr))

    return finalize_csr(loader, empty_csr, loaded_chunks)
def use_distributed(c=None):
    """ Setup nbodykit to work with dask.distributed.
    This will change the default MPI communicator to MPI.COMM_SELF,
    such that each nbodykit object only reside on a single MPI rank.

    This function shall only be used before any nbodykit object is created.

    Parameters
    ----------
    c : Client
        the distributed client. If not given, the default client is used.
        Notice that if you switch a new client then this function
        must be called again.
    """
    dask.config.set(scheduler="distributed")

    import distributed

    key = 'nbodykit_setup_for_distributed'
    if c is None:
        c = distributed.get_client()

    _setup_for_distributed()

    # use a lock to minimize chances of seeing KeyError from publish_dataset
    # the error is annoyingly printed to stderr even if we caught it.
    # The lock is bound to ``c`` so that a non-default client is honoured.
    lock = distributed.Lock(key, client=c)
    locked = lock.acquire(timeout=3)
    try:
        if key not in c.list_datasets():
            try:
                c.publish_dataset(**{key: True})
                c.register_worker_callbacks(setup=_setup_for_distributed)
            except KeyError:
                # already published, someone else is registering the callback.
                pass
    finally:
        # Release even when publishing fails with an unexpected error, so
        # other callers are not blocked on a lock that is never returned.
        if locked:
            lock.release()
def func(x):
    """Call ``get_client()`` and return ``None``; ``x`` is ignored."""
    get_client()
def f(x):
    """Submit ``inc(x)`` through the current client and return its result."""
    return get_client().submit(inc, x).result()
def f():
    """Generator coroutine: submit ``inc(10)`` and return its result.

    Yields ``get_client()`` to obtain the client, then yields the submitted
    future, and hands the value back through ``gen.Return``.
    """
    current = yield get_client()
    pending = current.submit(inc, 10)
    value = yield pending
    raise gen.Return(value)
def f():
    """Submit ``inc(1)`` through the current client and wait for it.

    The computed value is discarded; the function returns ``None``.
    """
    client = get_client()
    pending = client.submit(inc, 1)
    pending.result()