def f(x, release=True):
    # Try to acquire a lease on the named semaphore; give up after 0.1s.
    sem = Semaphore(name="x")
    if not sem.acquire(timeout=0.1):
        return False
    if release:
        # release() should return True once the lease is handed back.
        assert sem.release() is True

    return True
def test_worker_dies():
    with cluster(config={
            "distributed.scheduler.locks.lease-timeout": "0.1s",
    }) as (scheduler, workers):
        with Client(scheduler["address"]) as client:
            sem = Semaphore(name="x", max_leases=1)

            def f(x, sem, kill_address):
                with sem:
                    from distributed.worker import get_worker

                    worker = get_worker()
                    if worker.address == kill_address:
                        import os

                        os.kill(os.getpid(), 15)  # SIGTERM: kill this worker's process
                    return x

            futures = client.map(f,
                                 range(10),
                                 sem=sem,
                                 kill_address=workers[0]["address"])
            results = client.gather(futures)

            assert sorted(results) == list(range(10))
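The `distributed.scheduler.locks.lease-timeout` setting above is what keeps the test from hanging: the killed worker stops refreshing its lease, the scheduler reclaims it after 0.1s, and the rescheduled task can acquire the semaphore again. A minimal standalone sketch of the same setup (the semaphore name and the "2s" value are arbitrary assumptions):

import dask
from distributed import Client, Semaphore

with dask.config.set({"distributed.scheduler.locks.lease-timeout": "2s"}):
    with Client(processes=False) as client:
        sem = Semaphore(max_leases=1, name="crash-tolerant")

        def guarded(i, sem):
            # If a worker dies while holding the lease, the scheduler reclaims
            # the lease after the configured lease-timeout, so retried tasks
            # are not blocked forever.
            with sem:
                return i

        assert sorted(client.gather(client.map(guarded, range(5), sem=sem))) == list(range(5))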
Example #3
def _find_identifiers(physical_data_source: str, table_name: str,
                      columns_to_consider: List[str],
                      params: Configuration) -> Set[str]:
    _task_start = time.perf_counter()

    # Check if the scheduler is available
    if getattr(get_client(timeout=30), 'scheduler', None) is None:
        raise ServiceUnavailable(physical_data_source, table_name,
                                 columns_to_consider)

    # Limit the number of concurrently processed tables via a semaphore. This is
    # mainly used to control the workers' memory usage, as the sampled tables are
    # held in each worker's memory for faster processing (less ser/de overhead).
    # Keep in mind that a table with 8 columns and 10000 rows roughly takes up
    # 150-250MB of memory.
    with Semaphore(max_leases=params.table_retrieval_limit,
                   name='table_retrieval_limit'):
        logger.info(
            f'Starting to fetch and sample {physical_data_source}.{table_name} with columns {columns_to_consider}'
        )

        table_sample = PhysicalDataSourceSampler(params).sample(
            physical_data_source, table_name, columns_to_consider)

        logger.info(
            f'Fetching and sampling of {physical_data_source}.{table_name} done in {time.perf_counter() - _task_start:.2f}s'
        )

        with Semaphore(max_leases=params.table_processing_limit,
                       name='table_processing_limit'):
            logger.info(
                f'Running the identifier parser on {physical_data_source}.{table_name}'
            )

            _task_start = time.perf_counter()

            identifiers = IdentifierParser(params).parse(table_sample)

            Measure.histogram(
                'idparser_runtime',
                tags={
                    'physical_data_source': physical_data_source,
                    'table_name': table_name,
                    'num_columns': len(columns_to_consider),
                    'num_ids': len(identifiers),
                },
            )(time.perf_counter() - _task_start)

            return identifiers
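The nested semaphores above only throttle anything because they are addressed by name; every task that constructs a `Semaphore` with the same name shares the same pool of leases on the scheduler. A hypothetical driver sketch showing how such a function might be fanned out (the helper name, `tables`, and `columns_by_table` are made up for illustration):

from distributed import Client

def run_identifier_scan(client: Client, pds: str, tables, columns_by_table, params):
    # All submitted tasks create semaphores named 'table_retrieval_limit' and
    # 'table_processing_limit', so the limits in `params` apply cluster-wide.
    futures = [
        client.submit(_find_identifiers, pds, table, columns_by_table[table], params)
        for table in tables
    ]
    return dict(zip(tables, client.gather(futures)))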
def test_close_sync(client):
    sem = Semaphore()
    sem.close()

    with pytest.raises(RuntimeError,
                       match="Semaphore .* not known or already closed."):
        sem.acquire()
Example #5
def test_threadpoolworkers_pick_correct_ioloop(cleanup):
    # gh4057

    # About picking appropriate values for the various timings
    # * Sleep time in `access_limited` impacts test runtime but is arbitrary.
    # * `lease-timeout` should be smaller than the sleep time. This is what the
    #   test builds on. Assuming the leases cannot be refreshed, e.g. the wrong
    #   event loop is picked / the PeriodicCallback is never scheduled, the
    #   semaphore would become oversubscribed and len(protected_resource) would
    #   become non-zero. This should also trigger a log message about
    #   "unknown leases" and fail the test.
    # * The `lease-validation-interval` should be the smallest quantity. How
    #   often leases are checked for staleness is hard-coded at the moment to a
    #   fifth of the `lease-timeout`. Accounting for this and some jitter, it
    #   should be sufficiently small to ensure smooth operation.

    with dask.config.set({
        "distributed.scheduler.locks.lease-validation-interval": 0.01,
        "distributed.scheduler.locks.lease-timeout": 0.1,
    }):
        with Client(processes=False,
                    dashboard_address=":0",
                    threads_per_worker=4) as client:
            sem = Semaphore(max_leases=1, name="database")
            protected_resource = []

            def access_limited(val, sem):
                import time

                with sem:
                    assert len(protected_resource) == 0
                    protected_resource.append(val)
                    # Interact with the DB
                    time.sleep(0.2)
                    protected_resource.remove(val)

            client.gather(client.map(access_limited, range(10), sem=sem))
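As a quick sanity check of the relationship described in the comments above, the two settings can be read back with `dask.config.get` once set (the values are just the ones used in the test):

import dask

with dask.config.set({
    "distributed.scheduler.locks.lease-validation-interval": 0.01,
    "distributed.scheduler.locks.lease-timeout": 0.1,
}):
    interval = dask.config.get("distributed.scheduler.locks.lease-validation-interval")
    timeout = dask.config.get("distributed.scheduler.locks.lease-timeout")
    # The validation interval must stay well below the lease timeout.
    assert interval < timeout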
Example #6
def test_threadpoolworkers_pick_correct_ioloop(cleanup):
    # gh4057

    with dask.config.set({
        "distributed.scheduler.locks.lease-validation-interval": 0.01,
        "distributed.scheduler.locks.lease-timeout": 0.05,
    }):
        with Client(processes=False, threads_per_worker=4) as client:
            sem = Semaphore(max_leases=1, name="database")
            protected_resource = []

            def access_limited(val, sem):
                import time

                with sem:
                    assert len(protected_resource) == 0
                    protected_resource.append(val)
                    # Interact with the DB
                    time.sleep(0.1)
                    protected_resource.remove(val)

            client.gather(client.map(access_limited, range(10), sem=sem))
def test_timeout_sync(client):
    s = Semaphore(name="x")
    # Using the context manager already acquires a lease, so the line below won't be able to acquire another one
    with s:
        assert s.acquire(timeout=0.025) is False
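A minimal standalone sketch of the same behaviour, showing that the lease becomes available again once the `with` block exits (the local in-process client is an assumption; the test above gets its client from a fixture):

from distributed import Client, Semaphore

with Client(processes=False) as client:
    sem = Semaphore(name="x")  # max_leases defaults to 1

    with sem:
        # The context manager holds the only lease, so this acquire times out.
        assert sem.acquire(timeout=0.025) is False

    # After the block exits, the lease has been released and can be re-acquired.
    assert sem.acquire(timeout=1) is True
    sem.release()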