from distributed import Semaphore


def f(x, release=True):
    # Attach to the scheduler-side semaphore named "x".
    sem = Semaphore(name="x")
    if not sem.acquire(timeout=0.1):
        return False
    if release:
        assert sem.release() is True
    return True

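# A minimal usage sketch, not part of the original snippet: assuming a local
# in-process Client, `f` can be mapped over several inputs; tasks that cannot
# obtain a lease on "x" within the 0.1 s timeout simply return False.
from distributed import Client

if __name__ == "__main__":
    with Client(processes=False) as client:
        print(client.gather(client.map(f, range(5))))
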
from distributed import Client, Semaphore
from distributed.utils_test import cluster


def test_worker_dies():
    with cluster(
        config={
            "distributed.scheduler.locks.lease-timeout": "0.1s",
        }
    ) as (scheduler, workers):
        with Client(scheduler["address"]) as client:
            sem = Semaphore(name="x", max_leases=1)

            def f(x, sem, kill_address):
                with sem:
                    from distributed.worker import get_worker

                    worker = get_worker()
                    if worker.address == kill_address:
                        import os

                        # Kill this worker while it still holds the lease; the
                        # lease should time out and the remaining tasks finish.
                        os.kill(os.getpid(), 15)
                    return x

            futures = client.map(
                f, range(10), sem=sem, kill_address=workers[0]["address"]
            )
            results = client.gather(futures)

            assert sorted(results) == list(range(10))

def _find_identifiers(
    physical_data_source: str,
    table_name: str,
    columns_to_consider: List[str],
    params: Configuration,
) -> Set[str]:
    _task_start = time.perf_counter()

    # Check that the scheduler is available before doing any work.
    if getattr(get_client(timeout=30), 'scheduler', None) is None:
        raise ServiceUnavailable(physical_data_source, table_name, columns_to_consider)

    # Limit the number of tables retrieved at once via a semaphore. This is mainly
    # used to control the workers' memory usage, as the sampled tables are submitted
    # to each worker's memory for faster processing (less ser/de overhead).
    # Keep in mind that a table with 8 columns and 10000 rows roughly takes up
    # 150-250MB of memory.
    with Semaphore(max_leases=params.table_retrieval_limit, name='table_retrieval_limit'):
        logger.info(
            f'Starting fetching and sampling {physical_data_source}.{table_name} with {columns_to_consider}'
        )
        table_sample = PhysicalDataSourceSampler(params).sample(
            physical_data_source, table_name, columns_to_consider
        )
        logger.info(
            f'Fetching and sampling done {physical_data_source}.{table_name} in {time.perf_counter() - _task_start:.2f}s'
        )

    # Throttle the parsing step independently of table retrieval.
    with Semaphore(max_leases=params.table_processing_limit, name='table_processing_limit'):
        logger.info(f'Running the identifier parser on {physical_data_source}.{table_name}')
        _task_start = time.perf_counter()
        identifiers = IdentifierParser(params).parse(table_sample)
        Measure.histogram(
            'idparser_runtime',
            tags={
                'physical_data_source': physical_data_source,
                'table_name': table_name,
                'num_columns': len(columns_to_consider),
                'num_ids': len(identifiers),
            },
        )(time.perf_counter() - _task_start)

    return identifiers

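# Hypothetical driver sketch (the `run_identifier_scan` name, the `client`
# argument and the shape of `tables` are assumptions, not from the original
# code): the two named semaphores above only throttle anything when many
# `_find_identifiers` tasks run on the cluster at the same time, e.g. when
# submitted like this.
from distributed import Client


def run_identifier_scan(client: Client, params: Configuration, tables):
    # `tables` is assumed to be an iterable of
    # (physical_data_source, table_name, columns_to_consider) tuples.
    futures = [
        client.submit(_find_identifiers, source, table, columns, params)
        for source, table, columns in tables
    ]
    return client.gather(futures)
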
import pytest

from distributed import Semaphore


def test_close_sync(client):
    sem = Semaphore()
    sem.close()

    # After closing, any further use of the semaphore must fail.
    with pytest.raises(RuntimeError, match="Semaphore .* not known or already closed."):
        sem.acquire()

import dask
from distributed import Client, Semaphore


def test_threadpoolworkers_pick_correct_ioloop(cleanup):
    # gh4057
    #
    # About picking appropriate values for the various timings:
    # * The sleep time in `access_limited` impacts test runtime but is arbitrary.
    # * `lease-timeout` should be smaller than the sleep time. This is what the
    #   test builds on: assuming the leases cannot be refreshed, e.g. because the
    #   wrong event loop was picked or the PeriodicCallback was never scheduled,
    #   the semaphore would become oversubscribed and len(protected_resource)
    #   would become non-zero. This should also trigger a log message about
    #   "unknown leases" and fail the test.
    # * `lease-validation-interval` should be the smallest quantity. How often
    #   leases are checked for staleness is currently hard coded to a fifth of
    #   the `lease-timeout`. Accounting for this and some jitter, it should be
    #   sufficiently small to ensure smooth operation.
    with dask.config.set(
        {
            "distributed.scheduler.locks.lease-validation-interval": 0.01,
            "distributed.scheduler.locks.lease-timeout": 0.1,
        }
    ):
        with Client(processes=False, dashboard_address=":0", threads_per_worker=4) as client:
            sem = Semaphore(max_leases=1, name="database")
            protected_resource = []

            def access_limited(val, sem):
                import time

                with sem:
                    # Only one task at a time may touch the protected resource.
                    assert len(protected_resource) == 0
                    protected_resource.append(val)
                    # Interact with the DB
                    time.sleep(0.2)
                    protected_resource.remove(val)

            client.gather(client.map(access_limited, range(10), sem=sem))

import dask
from distributed import Client, Semaphore


def test_threadpoolworkers_pick_correct_ioloop(cleanup):
    # gh4057
    with dask.config.set(
        {
            "distributed.scheduler.locks.lease-validation-interval": 0.01,
            "distributed.scheduler.locks.lease-timeout": 0.05,
        }
    ):
        with Client(processes=False, threads_per_worker=4) as client:
            sem = Semaphore(max_leases=1, name="database")
            protected_resource = []

            def access_limited(val, sem):
                import time

                with sem:
                    # Only one task at a time may touch the protected resource.
                    assert len(protected_resource) == 0
                    protected_resource.append(val)
                    # Interact with the DB
                    time.sleep(0.1)
                    protected_resource.remove(val)

            client.gather(client.map(access_limited, range(10), sem=sem))

from distributed import Semaphore


def test_timeout_sync(client):
    s = Semaphore(name="x")
    # The context manager already acquires the single lease, so the acquire
    # below cannot obtain another one and must return False.
    with s:
        assert s.acquire(timeout=0.025) is False