Example #1
def test_spillbuffer_fail_to_serialize(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0)

    # bad data is individually larger than the spill threshold (target=200)
    a = Bad(size=201)

    # Exception caught in the worker
    with pytest.raises(TypeError, match="Could not serialize"):
        with captured_logger(logging.getLogger("distributed.spill")) as logs_bad_key:
            buf["a"] = a

    # spill.py must remain silent because we're already logging in worker.py
    assert not logs_bad_key.getvalue()
    assert_buf(buf, {}, {})

    b = Bad(size=100)  # this is small enough to fit in memory/fast

    buf["b"] = b
    assert_buf(buf, {"b": b}, {})

    c = "c" * 100
    with captured_logger(logging.getLogger("distributed.spill")) as logs_bad_key_mem:
        # This will go to fast and try to kick b out,
        # but b stays in fast since it's not picklable
        buf["c"] = c

    # worker.py won't intercept the exception here, so spill.py must dump the traceback
    logs_value = logs_bad_key_mem.getvalue()
    assert "Failed to pickle" in logs_value  # from distributed.spill
    assert "Traceback" in logs_value  # from distributed.spill
    assert_buf(buf, {"b": b, "c": c}, {})
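
Every example in this listing exercises captured_logger from distributed.utils_test. Roughly, it swaps the target logger's handlers for a single StringIO-backed StreamHandler for the duration of the with block, then restores them; tests read the captured text via sio.getvalue(). A sketch of that behaviour (close to, but not guaranteed identical to, the real utility):

import io
import logging
from contextlib import contextmanager

@contextmanager
def captured_logger(logger, level=logging.INFO, propagate=None):
    # Accept either a logger name or a Logger instance
    if isinstance(logger, str):
        logger = logging.getLogger(logger)
    orig_level, orig_handlers = logger.level, logger.handlers[:]
    if propagate is not None:
        orig_propagate, logger.propagate = logger.propagate, propagate
    sio = io.StringIO()
    logger.handlers[:] = [logging.StreamHandler(sio)]
    logger.setLevel(level)
    try:
        yield sio  # tests inspect the captured output with sio.getvalue()
    finally:
        logger.handlers[:] = orig_handlers
        logger.setLevel(orig_level)
        if propagate is not None:
            logger.propagate = orig_propagate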
Example #2
def test_spillbuffer_maxlim(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0)

    a, b, c, d, e = "a" * 200, "b" * 100, "c" * 99, "d" * 199, "e" * 98

    # size of a is bigger than target and is smaller than max_spill;
    # key should be in slow
    buf["a"] = a
    assert_buf(buf, {}, {"a": a})
    assert buf["a"] == a

    # size of b is smaller than target; key should be in fast
    buf["b"] = b
    assert_buf(buf, {"b": b}, {"a": a})

    # size of c is smaller than target but b+c > target; c should stay in fast and
    # b should move to slow, since the max_spill limit has not been reached yet
    buf["c"] = c
    assert_buf(buf, {"c": c}, {"a": a, "b": b})

    # size of e < target but e+c > target; this would trigger moving c to slow,
    # but the max_spill limit prevents it, so e joins c in fast

    with captured_logger(logging.getLogger("distributed.spill")) as logs_e:
        buf["e"] = e

    assert "disk reached capacity" in logs_e.getvalue()
    assert_buf(buf, {"c": c, "e": e}, {"a": a, "b": b})

    # size of d > target, so d should go to slow, but slow has reached the max_spill
    # limit, so d ends up in fast with c (which can't be moved to slow because it
    # won't fit either)
    with captured_logger(logging.getLogger("distributed.spill")) as logs_d:
        buf["d"] = d

    assert "disk reached capacity" in logs_d.getvalue()
    assert_buf(buf, {"c": c, "d": d, "e": e}, {"a": a, "b": b})

    # Overwrite a key that was in slow, but the size of the new key is larger than
    # max_spill

    a_large = "a" * 500
    assert psize(a_large)[1] > 600  # size of max_spill

    with captured_logger(logging.getLogger("distributed.spill")) as logs_alarge:
        buf["a"] = a_large

    assert "disk reached capacity" in logs_alarge.getvalue()
    assert_buf(buf, {"a": a_large, "d": d, "e": e}, {"b": b, "c": c})

    # Overwrite a key that was in fast, but the size of the new key is larger than
    # max_spill

    d_large = "d" * 501
    with captured_logger(logging.getLogger("distributed.spill")) as logs_dlarge:
        buf["d"] = d_large

    assert "disk reached capacity" in logs_dlarge.getvalue()
    assert_buf(buf, {"a": a_large, "d": d_large, "e": e}, {"b": b, "c": c})
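
The psize helper is not shown in the excerpt. Judging from psize(a_large)[1] being compared against max_spill above, it returns a tuple whose second element is the serialized (on-disk) size. A hypothetical reconstruction under that assumption:

import pickle
from dask.sizeof import sizeof  # assumption: the same sizeof used by the SpillBuffer

def psize(obj):
    # Hypothetical: (in-memory size, pickled size on disk)
    return sizeof(obj), len(pickle.dumps(obj))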
Example #3
def test_spillbuffer_oserror(tmpdir):
    buf = SpillBuffer(str(tmpdir),
                      target=200,
                      max_spill=800,
                      min_log_interval=0)

    a, b, c, d = (
        "a" * 200,
        "b" * 100,
        "c" * 201,
        "d" * 101,
    )

    # let's have something in fast and something in slow
    buf["a"] = a
    buf["b"] = b
    assert set(buf.fast) == {"b"}
    assert set(buf.slow) == {"a"}

    # Make the spill directory read-only.
    # This causes writes to raise OSError, just like when the disk is full.
    os.chmod(tmpdir, 0o555)

    # Add a key larger than target
    with captured_logger(
            logging.getLogger("distributed.spill")) as logs_oserror_slow:
        buf["c"] = c

    assert "Spill to disk failed" in logs_oserror_slow.getvalue()
    assert set(buf.fast) == {"b", "c"}
    assert set(buf.slow) == {"a"}

    assert buf.slow.weight_by_key == {"a": psize(a)}
    assert buf.fast.weights == {"b": sizeof(b), "c": sizeof(c)}

    del buf["c"]
    assert set(buf.fast) == {"b"}
    assert set(buf.slow) == {"a"}

    # Add a key to fast that is smaller than target, but whose insertion triggers a
    # spill, which in turn raises OSError
    with captured_logger(
            logging.getLogger("distributed.spill")) as logs_oserror_evict:
        buf["d"] = d

    assert "Spill to disk failed" in logs_oserror_evict.getvalue()
    assert set(buf.fast) == {"b", "d"}
    assert set(buf.slow) == {"a"}

    assert buf.slow.weight_by_key == {"a": psize(a)}
    assert buf.fast.weights == {"b": sizeof(b), "d": sizeof(d)}
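
A practical caveat, not part of the original test: on POSIX, entries cannot be removed from a directory that lacks write permission, so leaving tmpdir at 0o555 can make pytest's temp-directory cleanup fail later. Restoring the permissions at the end of the test is a sensible precaution:

    # hypothetical addition at the end of test_spillbuffer_oserror
    os.chmod(tmpdir, 0o755)  # restore write access so the tmpdir fixture can clean up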
Example #4
def test_serializers():
    with echo_server() as e:
        comm = yield connect(e.address)

        b = BatchedSend(interval='10ms', serializers=['msgpack'])
        b.start(comm)

        b.send({'x': to_serialize(123)})
        b.send({'x': to_serialize('hello')})
        yield gen.sleep(0.100)

        b.send({'x': to_serialize(lambda x: x + 1)})

        with captured_logger('distributed.protocol') as sio:
            yield gen.sleep(0.100)

        value = sio.getvalue()
        assert 'serialize' in value
        assert 'type' in value
        assert 'function' in value

        msg = yield comm.read()
        assert list(msg) == [{'x': 123}, {'x': 'hello'}]

        with pytest.raises(gen.TimeoutError):
            msg = yield gen.with_timeout(timedelta(milliseconds=100), comm.read())
Example #5
def test_serializers():
    with echo_server() as e:
        comm = yield connect(e.address)

        b = BatchedSend(interval='10ms', serializers=['msgpack'])
        b.start(comm)

        b.send({'x': to_serialize(123)})
        b.send({'x': to_serialize('hello')})
        yield gen.sleep(0.100)

        b.send({'x': to_serialize(lambda x: x + 1)})

        with captured_logger('distributed.protocol') as sio:
            yield gen.sleep(0.100)

        value = sio.getvalue()
        assert 'serialize' in value
        assert 'type' in value
        assert 'function' in value

        msg = yield comm.read()
        assert msg == [{'x': 123}, {'x': 'hello'}]

        with pytest.raises(gen.TimeoutError):
            msg = yield gen.with_timeout(timedelta(milliseconds=100),
                                         comm.read())
Example #6
async def test_override_data_vs_memory_monitor(c, s, a):
    a.monitor.get_process_memory = lambda: 801_000_000 if a.data else 0
    assert memory_monitor_running(a)

    # Push a key that would normally trip both the target and the spill thresholds
    class C:
        def __sizeof__(self):
            return 801_000_000

    # Capture output of log_errors()
    with captured_logger(logging.getLogger("distributed.utils")) as logger:
        x = c.submit(C)
        await wait(x)

        # The pause subsystem of the memory monitor has been tripped.
        # The spill subsystem hasn't.
        while a.status != Status.paused:
            await asyncio.sleep(0.01)
        await asyncio.sleep(0.05)

    # This would happen if memory_monitor() tried to blindly call SpillBuffer.evict()
    assert "Traceback" not in logger.getvalue()

    assert type(a.data) is UserDict
    assert a.data.keys() == {x.key}
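
The fixtures (c, s, a) above come from a decorator that this listing strips. For the final assertions to hold, the worker must have been started with a plain mapping instead of the default SpillBuffer; a hypothetical reconstruction of the stripped decorator (client, nthreads, and worker_kwargs are real gen_cluster parameters, but the exact configuration is assumed):

from collections import UserDict
from distributed.utils_test import gen_cluster

@gen_cluster(
    client=True,
    nthreads=[("", 1)],
    worker_kwargs={"data": UserDict},  # replace the default SpillBuffer
)
async def test_override_data_vs_memory_monitor(c, s, a):
    ...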
Example #7
def test_spillbuffer_evict(tmpdir):
    buf = SpillBuffer(str(tmpdir), target=300, min_log_interval=0)

    bad = Bad(size=100)
    a = "a" * 100

    buf["a"] = a
    assert_buf(buf, {"a": a}, {})

    # successful eviction
    weight = buf.evict()
    assert weight == sizeof(a)
    assert_buf(buf, {}, {"a": a})

    buf["bad"] = bad
    assert_buf(buf, {"bad": bad}, {"a": a})

    # unsuccessful eviction
    with captured_logger(logging.getLogger("distributed.spill")) as logs_evict_key:
        weight = buf.evict()
    assert weight == -1

    assert "Failed to pickle" in logs_evict_key.getvalue()
    # the bad key stays in fast
    assert_buf(buf, {"bad": bad}, {"a": a})
Example #8
async def test_adapt_oserror_scale():
    """
    FIXME:
    If we encounter an OSError during scale down, we continue as before. It is
    not entirely clear if this is the correct behaviour but defines the current
    state.
    This was probably introduced to protect against comm failures during
    shutdown but the scale down command should be robust call to the scheduler
    which is never scaled down.
    """
    class BadAdaptive(MyAdaptive):
        async def scale_down(self, workers=None):
            raise OSError()

    adapt = BadAdaptive(minimum=1, maximum=4, wait_count=0, interval="10ms")
    adapt._target = 2
    while not adapt.periodic_callback.is_running():
        await asyncio.sleep(0.01)
    await adapt.adapt()
    assert len(adapt.plan) == 2
    assert len(adapt.requested) == 2
    with captured_logger("distributed.deploy.adaptive_core") as log:
        adapt._target = 0
        await adapt.adapt()
    text = log.getvalue()
    assert "Error during adaptive downscaling" in text
    assert not adapt._adapting
    assert adapt.periodic_callback
    assert adapt.periodic_callback.is_running()
    adapt.stop()
Example #9
async def test_serializers():
    async with EchoServer() as e:
        comm = await connect(e.address)

        b = BatchedSend(interval="10ms", serializers=["msgpack"])
        b.start(comm)

        b.send({"x": to_serialize(123)})
        b.send({"x": to_serialize("hello")})
        await asyncio.sleep(0.100)

        b.send({"x": to_serialize(lambda x: x + 1)})

        with captured_logger("distributed.protocol") as sio:
            await asyncio.sleep(0.100)

        value = sio.getvalue()
        assert "serialize" in value
        assert "type" in value
        assert "function" in value

        msg = await comm.read()
        assert list(msg) == [{"x": 123}, {"x": "hello"}]

        with pytest.raises(TimeoutError):
            msg = await asyncio.wait_for(comm.read(), 0.1)
Example #10
async def test_failure_during_worker_initialization(s):
    with captured_logger(logger="distributed.nanny",
                         level=logging.WARNING) as logs:
        with pytest.raises(Exception):
            async with Nanny(s.address, foo="bar") as n:
                await n
    assert "Restarting worker" not in logs.getvalue()
Example #11
def test_pause_executor(c, s, a):
    memory = psutil.Process().memory_info().rss
    a.memory_limit = memory / 0.5 + 200e6
    np = pytest.importorskip("numpy")

    def f():
        x = np.ones(int(400e6), dtype="u1")
        sleep(1)

    with captured_logger(logging.getLogger("distributed.worker")) as logger:
        future = c.submit(f)
        futures = c.map(slowinc, range(30), delay=0.1)

        start = time()
        while not a.paused:
            yield gen.sleep(0.01)
            assert time() < start + 4, (
                format_bytes(psutil.Process().memory_info().rss),
                format_bytes(a.memory_limit),
                len(a.data),
            )
        out = logger.getvalue()
        assert "memory" in out.lower()
        assert "pausing" in out.lower()

    assert sum(f.status == "finished" for f in futures) < 4

    yield wait(futures)
Example #12
def test_serializers():
    with echo_server() as e:
        comm = yield connect(e.address)

        b = BatchedSend(interval="10ms", serializers=["msgpack"])
        b.start(comm)

        b.send({"x": to_serialize(123)})
        b.send({"x": to_serialize("hello")})
        yield gen.sleep(0.100)

        b.send({"x": to_serialize(lambda x: x + 1)})

        with captured_logger("distributed.protocol") as sio:
            yield gen.sleep(0.100)

        value = sio.getvalue()
        assert "serialize" in value
        assert "type" in value
        assert "function" in value

        msg = yield comm.read()
        assert list(msg) == [{"x": 123}, {"x": "hello"}]

        with pytest.raises(gen.TimeoutError):
            msg = yield gen.with_timeout(timedelta(milliseconds=100),
                                         comm.read())
Example #13
def test_logging_default():
    """
    Test default logging configuration.
    """
    d = logging.getLogger('distributed')
    assert len(d.handlers) == 1
    assert isinstance(d.handlers[0], logging.StreamHandler)

    # Work around Bokeh messing with the root logger level
    # https://github.com/bokeh/bokeh/issues/5793
    root = logging.getLogger('')
    old_root_level = root.level
    root.setLevel('WARN')

    for handler in d.handlers:
        handler.setLevel('INFO')

    try:
        dfb = logging.getLogger('distributed.foo.bar')
        f = logging.getLogger('foo')
        fb = logging.getLogger('foo.bar')

        with captured_handler(d.handlers[0]) as distributed_log:
            with captured_logger(root, level=logging.ERROR) as foreign_log:
                h = logging.StreamHandler(foreign_log)
                fmt = '[%(levelname)s in %(name)s] - %(message)s'
                h.setFormatter(logging.Formatter(fmt))
                fb.addHandler(h)
                fb.propagate = False

                # For debugging
                dump_logger_list()

                d.debug("1: debug")
                d.info("2: info")
                dfb.info("3: info")
                fb.info("4: info")
                fb.error("5: error")
                f.info("6: info")
                f.error("7: error")

        distributed_log = distributed_log.getvalue().splitlines()
        foreign_log = foreign_log.getvalue().splitlines()

        # distributed log is configured at INFO level by default
        assert distributed_log == [
            "distributed - INFO - 2: info",
            "distributed.foo.bar - INFO - 3: info",
        ]

        # foreign logs should be unaffected by distributed's logging
        # configuration.  They get the default ERROR level from logging.
        assert foreign_log == [
            "[ERROR in foo.bar] - 5: error",
            "7: error",
        ]

    finally:
        root.setLevel(old_root_level)
Example #15
def test_tick_logging(s, a, b):
    pytest.importorskip('crick')
    with set_config(**{'tick-maximum-delay': 10}):
        with captured_logger('distributed.core') as sio:
            yield gen.sleep(0.1)

    text = sio.getvalue()
    assert "unresponsive" in text
Example #16
async def test_restart_timeout_on_long_running_task(c, s, a):
    with captured_logger("distributed.scheduler") as sio:
        future = c.submit(sleep, 3600)
        await asyncio.sleep(0.1)
        await c.restart()

    text = sio.getvalue()
    assert "timeout" not in text.lower()
Example #17
async def test_web_preload(cleanup, scheduler_preload):
    with captured_logger("distributed.preloading") as log:
        async with Scheduler(
                host="localhost",
                preload=["http://127.0.0.1:12345/preload"],
        ) as s:
            assert s.foo == 1
    assert "12345/preload" in log.getvalue()
Example #18
def test_restart_timeout_on_long_running_task(c, s, a):
    with captured_logger('distributed.scheduler') as sio:
        future = c.submit(sleep, 3600)
        yield gen.sleep(0.1)
        yield c.restart(timeout=20)

    text = sio.getvalue()
    assert 'timeout' not in text.lower()
Example #20
async def test_release_failure(c, s, a, b):
    """Don't raise even if release fails: lease will be cleaned up by the lease-validation after
    a specified interval anyways (see config parameters used)."""

    with dask.config.set({"distributed.comm.retry.count": 1}):
        pool = await FlakyConnectionPool(failing_connections=5)

        semaphore = await Semaphore(
            max_leases=2,
            name="resource_we_want_to_limit",
            scheduler_rpc=pool(s.address),
        )
        await semaphore.acquire()
        pool.activate()  # Comm chaos starts

        # Release fails (after a single retry) because of broken connections
        with captured_logger(
            "distributed.semaphore", level=logging.ERROR
        ) as semaphore_log:
            with captured_logger("distributed.utils_comm") as retry_log:
                assert await semaphore.release() is False

        with captured_logger(
            "distributed.semaphore", level=logging.DEBUG
        ) as semaphore_cleanup_log:
            pool.deactivate()  # comm chaos stops
            assert await semaphore.get_value() == 1  # lease is still registered
            await asyncio.sleep(0.2)  # Wait for lease to be cleaned up

        # Check release was retried
        retry_log = retry_log.getvalue().split("\n")[0]
        assert retry_log.startswith(
            "Retrying semaphore release:"
        ) and retry_log.endswith("after exception in attempt 0/1: ")
        # Check release failed
        semaphore_log = semaphore_log.getvalue().split("\n")[0]
        assert semaphore_log.startswith(
            "Release failed for id="
        ) and semaphore_log.endswith("Cluster network might be unstable?")

        # Check lease has timed out
        assert any(
            log.startswith("Lease") and "timed out after" in log
            for log in semaphore_cleanup_log.getvalue().split("\n")
        )
        assert await semaphore.get_value() == 0
Example #21
async def test_delete_unset_variable(c, s, a, b):
    x = Variable()
    assert x.client is c
    with captured_logger(logging.getLogger("distributed.utils")) as logger:
        x.delete()
        await c.close()
    text = logger.getvalue()
    assert "KeyError" not in text
Example #22
async def test_pause_executor_with_memory_monitor(c, s, a):
    assert memory_monitor_running(a)
    mocked_rss = 0
    a.monitor.get_process_memory = lambda: mocked_rss

    # Task that is running when the worker pauses
    ev_x = Event()

    def f(ev):
        ev.wait()
        return 1

    # Task that is running on the worker when the worker pauses
    x = c.submit(f, ev_x, key="x")
    while a.executing_count != 1:
        await asyncio.sleep(0.01)

    with captured_logger(
            logging.getLogger("distributed.worker_memory")) as logger:
        # Task that is queued on the worker when the worker pauses
        y = c.submit(inc, 1, key="y")
        while "y" not in a.tasks:
            await asyncio.sleep(0.01)

        # Hog the worker with 900MB unmanaged memory
        mocked_rss = 900_000_000
        while s.workers[a.address].status != Status.paused:
            await asyncio.sleep(0.01)

        assert "Pausing worker" in logger.getvalue()

        # Task that is queued on the scheduler when the worker pauses.
        # It is not sent to the worker.
        z = c.submit(inc, 2, key="z")
        while "z" not in s.tasks or s.tasks["z"].state != "no-worker":
            await asyncio.sleep(0.01)
        assert s.unrunnable == {s.tasks["z"]}

        # Test that a task that already started when the worker paused can complete
        # and its output can be retrieved. Also test that the now free slot won't be
        # used by other tasks.
        await ev_x.set()
        assert await x == 1
        await asyncio.sleep(0.05)

        assert a.executing_count == 0
        assert len(a.ready) == 1
        assert a.tasks["y"].state == "ready"
        assert "z" not in a.tasks

        # Release the memory. Tasks that were queued on the worker are executed.
        # Tasks that were stuck on the scheduler are sent to the worker and executed.
        mocked_rss = 0
        assert await y == 2
        assert await z == 3

        assert a.status == Status.running
        assert "Resuming worker" in logger.getvalue()
Example #23
def test_wait_for_scheduler():
    with captured_logger('distributed') as log:
        w = Nanny('127.0.0.1:44737')
        w._start()
        yield gen.sleep(6)

    log = log.getvalue()
    assert 'error' not in log.lower(), log
    assert 'restart' not in log.lower(), log
Example #24
def test_wait_for_scheduler():
    with captured_logger("distributed") as log:
        w = Nanny("127.0.0.1:44737")
        w._start()
        yield gen.sleep(6)

    log = log.getvalue()
    assert "error" not in log.lower(), log
    assert "restart" not in log.lower(), log
Example #26
def test_safe_sizeof_logs_on_failure():
    class BadlySized:
        def __sizeof__(self):
            raise ValueError("bar")

    foo = BadlySized()

    # Defaults to 0.95 MiB
    with captured_logger(logging.getLogger("distributed.sizeof")) as logs:
        assert safe_sizeof(foo) == 1e6

    assert "Sizeof calculation failed. Defaulting to 0.95 MiB" in logs.getvalue()

    # Can provide custom `default_size`
    with captured_logger(logging.getLogger("distributed.sizeof")) as logs:
        default_size = 2 * (1024 ** 2)  # 2 MiB
        assert safe_sizeof(foo, default_size=default_size) == default_size

    assert "Defaulting to 2.00 MiB" in logs.getvalue()
Example #27
@contextmanager  # from contextlib; the import is omitted in this excerpt
def assert_amm_log(expect: list[str]):
    with captured_logger("distributed.active_memory_manager",
                         level=logging.DEBUG) as logger:
        yield
    actual = logger.getvalue().splitlines()
    if len(actual) != len(expect) or any(not a.startswith(e)
                                         for a, e in zip(actual, expect)):
        raise AssertionError("Log lines mismatch:\n" + "\n".join(actual) +
                             "\n" + "=" * 80 + "\n" + "Does not match:\n" +
                             "\n".join(expect))
Example #28
def test_safe_sizeof_raises():
    class BadlySized:
        def __sizeof__(self):
            raise ValueError("bar")

    foo = BadlySized()
    with captured_logger(logging.getLogger("distributed.sizeof")) as logs:
        assert safe_sizeof(foo) == 1e6

    assert "Sizeof calculation failed.  Defaulting to 1MB" in logs.getvalue()
Example #29
async def test_no_hang_when_scheduler_closes(s, a, b):
    # https://github.com/dask/distributed/issues/2880
    with captured_logger("tornado.application", logging.ERROR) as logger:
        await s.close()
        await asyncio.sleep(1.2)
        assert a.status == Status.closed
        assert b.status == Status.closed

    out = logger.getvalue()
    assert "Timed out trying to connect" not in out
Example #30
async def test_wait_for_scheduler():
    with captured_logger("distributed") as log:
        w = Nanny("127.0.0.1:44737")
        IOLoop.current().add_callback(w.start)
        await asyncio.sleep(6)
        await w.close()

    log = log.getvalue()
    assert "error" not in log.lower(), log
    assert "restart" not in log.lower(), log
Example #31
def test_workspace_process_crash(tmpdir):
    # WorkSpace should be able to clean up stale contents left by
    # crashed process
    base_dir = str(tmpdir)
    assert_contents = functools.partial(assert_directory_contents, base_dir)

    ws = WorkSpace(base_dir)

    code = """if 1:
        import signal
        import sys
        import time

        from distributed.diskutils import WorkSpace

        ws = WorkSpace(%(base_dir)r)
        a = ws.new_work_dir(name='aa')
        b = ws.new_work_dir(prefix='foo-')
        print((a.dir_path, b.dir_path))
        sys.stdout.flush()

        time.sleep(100)
        """ % dict(
        base_dir=base_dir
    )

    p = subprocess.Popen(
        [sys.executable, "-c", code],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    line = p.stdout.readline()
    assert p.poll() is None
    a_path, b_path = eval(line)
    assert_contents([a_path, a_path + ".dirlock", b_path, b_path + ".dirlock"])

    # The child process holds a lock so the work dirs shouldn't be removed
    ws._purge_leftovers()
    assert_contents([a_path, a_path + ".dirlock", b_path, b_path + ".dirlock"])

    # Kill the process so it's unable to clear the work dirs itself
    p.kill()
    assert p.wait()  # process returned with non-zero code
    assert_contents([a_path, a_path + ".dirlock", b_path, b_path + ".dirlock"])

    with captured_logger("distributed.diskutils", "INFO", propagate=False) as sio:
        ws._purge_leftovers()
    assert_contents([])
    # One log line per purged directory
    lines = sio.getvalue().splitlines()
    assert len(lines) == 2
    for p in (a_path, b_path):
        assert any(repr(p) in line for line in lines)
Example #32
def test_close_twice():
    with LocalCluster() as cluster:
        with Client(cluster.scheduler_address) as client:
            f = client.map(inc, range(100))
            client.gather(f)
        with captured_logger('tornado.application') as log:
            cluster.close()
            cluster.close()
            sleep(0.5)
        log = log.getvalue()
        assert not log
Example #33
def test_close_twice():
    with LocalCluster() as cluster:
        with Client(cluster.scheduler_address) as client:
            f = client.map(inc, range(100))
            client.gather(f)
        with captured_logger("tornado.application") as log:
            cluster.close()
            cluster.close()
            sleep(0.5)
        log = log.getvalue()
        assert not log
Example #34
def test_lose_task(c, s, a, b):
    with captured_logger('distributed.stealing') as log:
        s.periodic_callbacks['stealing'].interval = 1
        for i in range(100):
            futures = c.map(slowinc, range(10), delay=0.01, pure=False,
                            workers=a.address, allow_other_workers=True)
            yield gen.sleep(0.01)
            del futures

    out = log.getvalue()
    assert 'Error' not in out
Example #35
def test_nanny_timeout(c, s, a):
    x = yield c.scatter(123)
    with captured_logger(logging.getLogger('distributed.nanny'),
                         level=logging.ERROR) as logger:
        response = yield a.restart(timeout=0.1)

    out = logger.getvalue()
    assert 'timed out' in out.lower()

    start = time()
    while x.status != 'cancelled':
        yield gen.sleep(0.1)
        assert time() < start + 7
Example #36
def test_workspace_rmtree_failure(tmpdir):
    base_dir = str(tmpdir)

    ws = WorkSpace(base_dir)
    a = ws.new_work_dir(name='aa')
    shutil.rmtree(a.dir_path)
    with captured_logger('distributed.diskutils', 'ERROR', propagate=False) as sio:
        a.release()
    lines = sio.getvalue().splitlines()
    # shutil.rmtree() may call its onerror callback several times
    assert lines
    for line in lines:
        assert line.startswith("Failed to remove %r" % (a.dir_path,))
Example #37
def test_nanny_timeout(c, s, a):
    x = yield c.scatter(123)
    with captured_logger(logging.getLogger("distributed.nanny"),
                         level=logging.ERROR) as logger:
        response = yield a.restart(timeout=0.1)

    out = logger.getvalue()
    assert "timed out" in out.lower()

    start = time()
    while x.status != "cancelled":
        yield gen.sleep(0.1)
        assert time() < start + 7
Example #38
@contextmanager  # from contextlib; the import is omitted in this excerpt
def enable_gc_diagnosis_and_log(diag, level='INFO'):
    disable_gc_diagnosis(force=True)  # just in case
    assert not gc.callbacks

    with captured_logger('distributed.utils_perf', level=level,
                         propagate=False) as sio:
        gc.disable()
        gc.collect()  # drain any leftover from previous tests
        diag.enable()
        try:
            yield sio
        finally:
            diag.disable()
            gc.enable()
Example #39
def test_tick_logging(s, a, b):
    pytest.importorskip('crick')
    from distributed import core
    old = core.tick_maximum_delay
    core.tick_maximum_delay = 0.001
    try:
        with captured_logger('distributed.core') as sio:
            yield gen.sleep(0.1)

        text = sio.getvalue()
        assert "unresponsive" in text
        assert 'Scheduler' in text or 'Worker' in text
    finally:
        core.tick_maximum_delay = old
Example #40
def test_workspace_process_crash(tmpdir):
    # WorkSpace should be able to clean up stale contents left by
    # crashed process
    base_dir = str(tmpdir)
    assert_contents = functools.partial(assert_directory_contents, base_dir)

    ws = WorkSpace(base_dir)

    code = """if 1:
        import signal
        import sys
        import time

        from distributed.diskutils import WorkSpace

        ws = WorkSpace(%(base_dir)r)
        a = ws.new_work_dir(name='aa')
        b = ws.new_work_dir(prefix='foo-')
        print((a.dir_path, b.dir_path))
        sys.stdout.flush()

        time.sleep(100)
        """ % dict(base_dir=base_dir)

    p = subprocess.Popen([sys.executable, '-c', code],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         universal_newlines=True)
    line = p.stdout.readline()
    assert p.poll() is None
    a_path, b_path = eval(line)
    assert_contents([a_path, a_path + '.dirlock', b_path, b_path + '.dirlock'])

    # The child process holds a lock so the work dirs shouldn't be removed
    ws._purge_leftovers()
    assert_contents([a_path, a_path + '.dirlock', b_path, b_path + '.dirlock'])

    # Kill the process so it's unable to clear the work dirs itself
    p.kill()
    assert p.wait()  # process returned with non-zero code
    assert_contents([a_path, a_path + '.dirlock', b_path, b_path + '.dirlock'])

    with captured_logger('distributed.diskutils', 'INFO', propagate=False) as sio:
        ws._purge_leftovers()
    assert_contents([])
    # One log line per purged directory
    lines = sio.getvalue().splitlines()
    assert len(lines) == 2
    for p in (a_path, b_path):
        assert any(repr(p) in line for line in lines)
Example #41
def test_all_exceptions_logging():
    @gen.coroutine
    def throws():
        raise Exception('foo1234')

    with captured_logger('') as sio:
        try:
            yield All([throws() for _ in range(5)],
                      quiet_exceptions=Exception)
        except Exception:
            pass

        import gc; gc.collect()
        yield gen.sleep(0.1)

    assert 'foo1234' not in sio.getvalue()
Example #42
def test_logging():
    """
    Test default logging configuration.
    """
    d = logging.getLogger('distributed')
    assert len(d.handlers) == 1
    assert isinstance(d.handlers[0], logging.StreamHandler)

    root = logging.getLogger('')
    dfb = logging.getLogger('distributed.foo.bar')
    f = logging.getLogger('foo')
    fb = logging.getLogger('foo.bar')

    with captured_handler(d.handlers[0]) as distributed_log:
        with captured_logger(root) as foreign_log:
            h = logging.StreamHandler(foreign_log)
            fmt = '[%(levelname)s in %(name)s] - %(message)s'
            h.setFormatter(logging.Formatter(fmt))
            fb.addHandler(h)
            fb.propagate = False

            d.debug("1: debug")
            d.info("2: info")
            dfb.info("3: info")
            fb.info("4: info")
            fb.error("5: error")
            f.info("6: info")
            f.error("7: error")

    distributed_log = distributed_log.getvalue().splitlines()
    foreign_log = foreign_log.getvalue().splitlines()

    # distributed log is configured at INFO level by default
    assert distributed_log == [
        "distributed - INFO - 2: info",
        "distributed.foo.bar - INFO - 3: info",
        ]

    # foreign logs should be unaffected by distributed's logging
    # configuration.  They get the default ERROR level from logging.
    assert foreign_log == [
        "[ERROR in foo.bar] - 5: error",
        "7: error",
        ]
Example #43
def test_nanny_terminate(c, s, a):
    from time import sleep

    def leak():
        L = []
        while True:
            L.append(b'0' * 5000000)
            sleep(0.01)

    proc = a.process.pid
    with captured_logger(logging.getLogger('distributed.nanny')) as logger:
        future = c.submit(leak)
        start = time()
        while a.process.pid == proc:
            yield gen.sleep(0.1)
            assert time() < start + 10
        out = logger.getvalue()
        assert 'restart' in out.lower()
        assert 'memory' in out.lower()
Example #44
def _workspace_concurrency(base_dir, purged_q, err_q, stop_evt):
    ws = WorkSpace(base_dir)
    n_purged = 0
    with captured_logger('distributed.diskutils', 'ERROR') as sio:
        while not stop_evt.is_set():
            # Add a bunch of locks, and simulate forgetting them
            try:
                purged = ws._purge_leftovers()
            except Exception as e:
                err_q.put(e)
            else:
                n_purged += len(purged)

    lines = sio.getvalue().splitlines()
    if lines:
        try:
            raise AssertionError("got %d logs, see stderr" % len(lines))
        except Exception as e:
            err_q.put(e)

    purged_q.put(n_purged)
Example #45
def test_pause_executor(c, s, a):
    memory = psutil.Process().memory_info().rss
    a.memory_limit = memory / 0.8 + 200e6
    np = pytest.importorskip('numpy')

    def f():
        x = np.ones(int(300e6), dtype='u1')
        sleep(1)

    with captured_logger(logging.getLogger('distributed.worker')) as logger:
        future = c.submit(f)
        futures = c.map(slowinc, range(10), delay=0.1)

        yield gen.sleep(0.3)
        assert a.paused, (format_bytes(psutil.Process().memory_info().rss),
                          format_bytes(a.memory_limit))
        out = logger.getvalue()
        assert 'memory' in out.lower()
        assert 'pausing' in out.lower()

    assert sum(f.status == 'finished' for f in futures) < 4

    yield wait(futures)