def test_spillbuffer_fail_to_serialize(tmpdir): buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0) # bad data individually larger than spill threshold target 200 a = Bad(size=201) # Exception caught in the worker with pytest.raises(TypeError, match="Could not serialize"): with captured_logger(logging.getLogger("distributed.spill")) as logs_bad_key: buf["a"] = a # spill.py must remain silent because we're already logging in worker.py assert not logs_bad_key.getvalue() assert_buf(buf, {}, {}) b = Bad(size=100) # this is small enough to fit in memory/fast buf["b"] = b assert_buf(buf, {"b": b}, {}) c = "c" * 100 with captured_logger(logging.getLogger("distributed.spill")) as logs_bad_key_mem: # This will go to fast and try to kick b out, # but keep b in fast since it's not pickable buf["c"] = c # worker.py won't intercept the exception here, so spill.py must dump the traceback logs_value = logs_bad_key_mem.getvalue() assert "Failed to pickle" in logs_value # from distributed.spill assert "Traceback" in logs_value # from distributed.spill assert_buf(buf, {"b": b, "c": c}, {})
def test_spillbuffer_maxlim(tmpdir): buf = SpillBuffer(str(tmpdir), target=200, max_spill=600, min_log_interval=0) a, b, c, d, e = "a" * 200, "b" * 100, "c" * 99, "d" * 199, "e" * 98 # size of a is bigger than target and is smaller than max_spill; # key should be in slow buf["a"] = a assert_buf(buf, {}, {"a": a}) assert buf["a"] == a # size of b is smaller than target key should be in fast buf["b"] = b assert_buf(buf, {"b": b}, {"a": a}) # size of c is smaller than target but b+c > target, c should stay in fast and b # move to slow since the max_spill limit has not been reached yet buf["c"] = c assert_buf(buf, {"c": c}, {"a": a, "b": b}) # size of e < target but e+c > target, this will trigger movement of c to slow # but the max spill limit prevents it. Resulting in e remaining in fast with captured_logger(logging.getLogger("distributed.spill")) as logs_e: buf["e"] = e assert "disk reached capacity" in logs_e.getvalue() assert_buf(buf, {"c": c, "e": e}, {"a": a, "b": b}) # size of d > target, d should go to slow but slow reached the max_spill limit then # d will end up on fast with c (which can't be move to slow because it won't fit # either) with captured_logger(logging.getLogger("distributed.spill")) as logs_d: buf["d"] = d assert "disk reached capacity" in logs_d.getvalue() assert_buf(buf, {"c": c, "d": d, "e": e}, {"a": a, "b": b}) # Overwrite a key that was in slow, but the size of the new key is larger than # max_spill a_large = "a" * 500 assert psize(a_large)[1] > 600 # size of max_spill with captured_logger(logging.getLogger("distributed.spill")) as logs_alarge: buf["a"] = a_large assert "disk reached capacity" in logs_alarge.getvalue() assert_buf(buf, {"a": a_large, "d": d, "e": e}, {"b": b, "c": c}) # Overwrite a key that was in fast, but the size of the new key is larger than # max_spill d_large = "d" * 501 with captured_logger(logging.getLogger("distributed.spill")) as logs_dlarge: buf["d"] = d_large assert "disk reached capacity" in logs_dlarge.getvalue() assert_buf(buf, {"a": a_large, "d": d_large, "e": e}, {"b": b, "c": c})
def test_spillbuffer_oserror(tmpdir): buf = SpillBuffer(str(tmpdir), target=200, max_spill=800, min_log_interval=0) a, b, c, d = ( "a" * 200, "b" * 100, "c" * 201, "d" * 101, ) # let's have something in fast and something in slow buf["a"] = a buf["b"] = b assert set(buf.fast) == {"b"} assert set(buf.slow) == {"a"} # modify permissions of disk to be read only. # This causes writes to raise OSError, just like in case of disk full. os.chmod(tmpdir, 0o555) # Add key > than target with captured_logger( logging.getLogger("distributed.spill")) as logs_oserror_slow: buf["c"] = c assert "Spill to disk failed" in logs_oserror_slow.getvalue() assert set(buf.fast) == {"b", "c"} assert set(buf.slow) == {"a"} assert buf.slow.weight_by_key == {"a": psize(a)} assert buf.fast.weights == {"b": sizeof(b), "c": sizeof(c)} del buf["c"] assert set(buf.fast) == {"b"} assert set(buf.slow) == {"a"} # add key to fast which is smaller than target but when added it triggers spill, # which triggers OSError with captured_logger( logging.getLogger("distributed.spill")) as logs_oserror_evict: buf["d"] = d assert "Spill to disk failed" in logs_oserror_evict.getvalue() assert set(buf.fast) == {"b", "d"} assert set(buf.slow) == {"a"} assert buf.slow.weight_by_key == {"a": psize(a)} assert buf.fast.weights == {"b": sizeof(b), "d": sizeof(d)}
def test_serializers(): with echo_server() as e: comm = yield connect(e.address) b = BatchedSend(interval='10ms', serializers=['msgpack']) b.start(comm) b.send({'x': to_serialize(123)}) b.send({'x': to_serialize('hello')}) yield gen.sleep(0.100) b.send({'x': to_serialize(lambda x: x + 1)}) with captured_logger('distributed.protocol') as sio: yield gen.sleep(0.100) value = sio.getvalue() assert 'serialize' in value assert 'type' in value assert 'function' in value msg = yield comm.read() assert list(msg) == [{'x': 123}, {'x': 'hello'}] with pytest.raises(gen.TimeoutError): msg = yield gen.with_timeout(timedelta(milliseconds=100), comm.read())
def test_serializers(): with echo_server() as e: comm = yield connect(e.address) b = BatchedSend(interval='10ms', serializers=['msgpack']) b.start(comm) b.send({'x': to_serialize(123)}) b.send({'x': to_serialize('hello')}) yield gen.sleep(0.100) b.send({'x': to_serialize(lambda x: x + 1)}) with captured_logger('distributed.protocol') as sio: yield gen.sleep(0.100) value = sio.getvalue() assert 'serialize' in value assert 'type' in value assert 'function' in value msg = yield comm.read() assert msg == [{'x': 123}, {'x': 'hello'}] with pytest.raises(gen.TimeoutError): msg = yield gen.with_timeout(timedelta(milliseconds=100), comm.read())
async def test_override_data_vs_memory_monitor(c, s, a): a.monitor.get_process_memory = lambda: 801_000_000 if a.data else 0 assert memory_monitor_running(a) # Push a key that would normally trip both the target and the spill thresholds class C: def __sizeof__(self): return 801_000_000 # Capture output of log_errors() with captured_logger(logging.getLogger("distributed.utils")) as logger: x = c.submit(C) await wait(x) # The pause subsystem of the memory monitor has been tripped. # The spill subsystem hasn't. while a.status != Status.paused: await asyncio.sleep(0.01) await asyncio.sleep(0.05) # This would happen if memory_monitor() tried to blindly call SpillBuffer.evict() assert "Traceback" not in logger.getvalue() assert type(a.data) is UserDict assert a.data.keys() == {x.key}
def test_spillbuffer_evict(tmpdir): buf = SpillBuffer(str(tmpdir), target=300, min_log_interval=0) bad = Bad(size=100) a = "a" * 100 buf["a"] = a assert_buf(buf, {"a": a}, {}) # successful eviction weight = buf.evict() assert weight == sizeof(a) assert_buf(buf, {}, {"a": a}) buf["bad"] = bad assert_buf(buf, {"bad": bad}, {"a": a}) # unsuccessful eviction with captured_logger(logging.getLogger("distributed.spill")) as logs_evict_key: weight = buf.evict() assert weight == -1 assert "Failed to pickle" in logs_evict_key.getvalue() # bad keys stays in fast assert_buf(buf, {"bad": bad}, {"a": a})
async def test_adapt_oserror_scale(): """ FIXME: If we encounter an OSError during scale down, we continue as before. It is not entirely clear if this is the correct behaviour but defines the current state. This was probably introduced to protect against comm failures during shutdown but the scale down command should be robust call to the scheduler which is never scaled down. """ class BadAdaptive(MyAdaptive): async def scale_down(self, workers=None): raise OSError() adapt = BadAdaptive(minimum=1, maximum=4, wait_count=0, interval="10ms") adapt._target = 2 while not adapt.periodic_callback.is_running(): await asyncio.sleep(0.01) await adapt.adapt() assert len(adapt.plan) == 2 assert len(adapt.requested) == 2 with captured_logger("distributed.deploy.adaptive_core") as log: adapt._target = 0 await adapt.adapt() text = log.getvalue() assert "Error during adaptive downscaling" in text assert not adapt._adapting assert adapt.periodic_callback assert adapt.periodic_callback.is_running() adapt.stop()
async def test_serializers(): async with EchoServer() as e: comm = await connect(e.address) b = BatchedSend(interval="10ms", serializers=["msgpack"]) b.start(comm) b.send({"x": to_serialize(123)}) b.send({"x": to_serialize("hello")}) await asyncio.sleep(0.100) b.send({"x": to_serialize(lambda x: x + 1)}) with captured_logger("distributed.protocol") as sio: await asyncio.sleep(0.100) value = sio.getvalue() assert "serialize" in value assert "type" in value assert "function" in value msg = await comm.read() assert list(msg) == [{"x": 123}, {"x": "hello"}] with pytest.raises(TimeoutError): msg = await asyncio.wait_for(comm.read(), 0.1)
async def test_failure_during_worker_initialization(s): with captured_logger(logger="distributed.nanny", level=logging.WARNING) as logs: with pytest.raises(Exception): async with Nanny(s.address, foo="bar") as n: await n assert "Restarting worker" not in logs.getvalue()
def test_pause_executor(c, s, a): memory = psutil.Process().memory_info().rss a.memory_limit = memory / 0.5 + 200e6 np = pytest.importorskip("numpy") def f(): x = np.ones(int(400e6), dtype="u1") sleep(1) with captured_logger(logging.getLogger("distributed.worker")) as logger: future = c.submit(f) futures = c.map(slowinc, range(30), delay=0.1) start = time() while not a.paused: yield gen.sleep(0.01) assert time() < start + 4, ( format_bytes(psutil.Process().memory_info().rss), format_bytes(a.memory_limit), len(a.data), ) out = logger.getvalue() assert "memory" in out.lower() assert "pausing" in out.lower() assert sum(f.status == "finished" for f in futures) < 4 yield wait(futures)
def test_serializers(): with echo_server() as e: comm = yield connect(e.address) b = BatchedSend(interval="10ms", serializers=["msgpack"]) b.start(comm) b.send({"x": to_serialize(123)}) b.send({"x": to_serialize("hello")}) yield gen.sleep(0.100) b.send({"x": to_serialize(lambda x: x + 1)}) with captured_logger("distributed.protocol") as sio: yield gen.sleep(0.100) value = sio.getvalue() assert "serialize" in value assert "type" in value assert "function" in value msg = yield comm.read() assert list(msg) == [{"x": 123}, {"x": "hello"}] with pytest.raises(gen.TimeoutError): msg = yield gen.with_timeout(timedelta(milliseconds=100), comm.read())
def test_logging_default(): """ Test default logging configuration. """ d = logging.getLogger('distributed') assert len(d.handlers) == 1 assert isinstance(d.handlers[0], logging.StreamHandler) # Work around Bokeh messing with the root logger level # https://github.com/bokeh/bokeh/issues/5793 root = logging.getLogger('') old_root_level = root.level root.setLevel('WARN') for handler in d.handlers: handler.setLevel('INFO') try: dfb = logging.getLogger('distributed.foo.bar') f = logging.getLogger('foo') fb = logging.getLogger('foo.bar') with captured_handler(d.handlers[0]) as distributed_log: with captured_logger(root, level=logging.ERROR) as foreign_log: h = logging.StreamHandler(foreign_log) fmt = '[%(levelname)s in %(name)s] - %(message)s' h.setFormatter(logging.Formatter(fmt)) fb.addHandler(h) fb.propagate = False # For debugging dump_logger_list() d.debug("1: debug") d.info("2: info") dfb.info("3: info") fb.info("4: info") fb.error("5: error") f.info("6: info") f.error("7: error") distributed_log = distributed_log.getvalue().splitlines() foreign_log = foreign_log.getvalue().splitlines() # distributed log is configured at INFO level by default assert distributed_log == [ "distributed - INFO - 2: info", "distributed.foo.bar - INFO - 3: info", ] # foreign logs should be unaffected by distributed's logging # configuration. They get the default ERROR level from logging. assert foreign_log == [ "[ERROR in foo.bar] - 5: error", "7: error", ] finally: root.setLevel(old_root_level)
def test_logging_default(): """ Test default logging configuration. """ d = logging.getLogger('distributed') assert len(d.handlers) == 1 assert isinstance(d.handlers[0], logging.StreamHandler) # Work around Bokeh messing with the root logger level # https://github.com/bokeh/bokeh/issues/5793 root = logging.getLogger('') old_root_level = root.level root.setLevel('WARN') for handler in d.handlers: handler.setLevel('INFO') try: dfb = logging.getLogger('distributed.foo.bar') f = logging.getLogger('foo') fb = logging.getLogger('foo.bar') with captured_handler(d.handlers[0]) as distributed_log: with captured_logger(root, level=logging.ERROR) as foreign_log: h = logging.StreamHandler(foreign_log) fmt = '[%(levelname)s in %(name)s] - %(message)s' h.setFormatter(logging.Formatter(fmt)) fb.addHandler(h) fb.propagate = False # For debugging dump_logger_list() d.debug("1: debug") d.info("2: info") dfb.info("3: info") fb.info("4: info") fb.error("5: error") f.info("6: info") f.error("7: error") distributed_log = distributed_log.getvalue().splitlines() foreign_log = foreign_log.getvalue().splitlines() # distributed log is configured at INFO level by default assert distributed_log == [ "distributed - INFO - 2: info", "distributed.foo.bar - INFO - 3: info", ] # foreign logs should be unaffected by distributed's logging # configuration. They get the default ERROR level from logging. assert foreign_log == [ "[ERROR in foo.bar] - 5: error", "7: error", ] finally: root.setLevel(old_root_level)
def test_tick_logging(s, a, b): pytest.importorskip('crick') with set_config(**{'tick-maximum-delay': 10}): with captured_logger('distributed.core') as sio: yield gen.sleep(0.1) text = sio.getvalue() assert "unresponsive" in text
async def test_restart_timeout_on_long_running_task(c, s, a): with captured_logger("distributed.scheduler") as sio: future = c.submit(sleep, 3600) await asyncio.sleep(0.1) await c.restart() text = sio.getvalue() assert "timeout" not in text.lower()
async def test_web_preload(cleanup, scheduler_preload): with captured_logger("distributed.preloading") as log: async with Scheduler( host="localhost", preload=["http://127.0.0.1:12345/preload"], ) as s: assert s.foo == 1 assert "12345/preload" in log.getvalue()
def test_restart_timeout_on_long_running_task(c, s, a): with captured_logger('distributed.scheduler') as sio: future = c.submit(sleep, 3600) yield gen.sleep(0.1) yield c.restart(timeout=20) text = sio.getvalue() assert 'timeout' not in text.lower()
def test_restart_timeout_on_long_running_task(c, s, a): with captured_logger('distributed.scheduler') as sio: future = c.submit(sleep, 3600) yield gen.sleep(0.1) yield c.restart(timeout=20) text = sio.getvalue() assert 'timeout' not in text.lower()
async def test_release_failure(c, s, a, b): """Don't raise even if release fails: lease will be cleaned up by the lease-validation after a specified interval anyways (see config parameters used).""" with dask.config.set({"distributed.comm.retry.count": 1}): pool = await FlakyConnectionPool(failing_connections=5) semaphore = await Semaphore( max_leases=2, name="resource_we_want_to_limit", scheduler_rpc=pool(s.address), ) await semaphore.acquire() pool.activate() # Comm chaos starts # Release fails (after a single retry) because of broken connections with captured_logger( "distributed.semaphore", level=logging.ERROR ) as semaphore_log: with captured_logger("distributed.utils_comm") as retry_log: assert await semaphore.release() is False with captured_logger( "distributed.semaphore", level=logging.DEBUG ) as semaphore_cleanup_log: pool.deactivate() # comm chaos stops assert await semaphore.get_value() == 1 # lease is still registered await asyncio.sleep(0.2) # Wait for lease to be cleaned up # Check release was retried retry_log = retry_log.getvalue().split("\n")[0] assert retry_log.startswith( "Retrying semaphore release:" ) and retry_log.endswith("after exception in attempt 0/1: ") # Check release failed semaphore_log = semaphore_log.getvalue().split("\n")[0] assert semaphore_log.startswith( "Release failed for id=" ) and semaphore_log.endswith("Cluster network might be unstable?") # Check lease has timed out assert any( log.startswith("Lease") and "timed out after" in log for log in semaphore_cleanup_log.getvalue().split("\n") ) assert await semaphore.get_value() == 0
async def test_delete_unset_variable(c, s, a, b): x = Variable() assert x.client is c with captured_logger(logging.getLogger("distributed.utils")) as logger: x.delete() await c.close() text = logger.getvalue() assert "KeyError" not in text
async def test_pause_executor_with_memory_monitor(c, s, a): assert memory_monitor_running(a) mocked_rss = 0 a.monitor.get_process_memory = lambda: mocked_rss # Task that is running when the worker pauses ev_x = Event() def f(ev): ev.wait() return 1 # Task that is running on the worker when the worker pauses x = c.submit(f, ev_x, key="x") while a.executing_count != 1: await asyncio.sleep(0.01) with captured_logger( logging.getLogger("distributed.worker_memory")) as logger: # Task that is queued on the worker when the worker pauses y = c.submit(inc, 1, key="y") while "y" not in a.tasks: await asyncio.sleep(0.01) # Hog the worker with 900MB unmanaged memory mocked_rss = 900_000_000 while s.workers[a.address].status != Status.paused: await asyncio.sleep(0.01) assert "Pausing worker" in logger.getvalue() # Task that is queued on the scheduler when the worker pauses. # It is not sent to the worker. z = c.submit(inc, 2, key="z") while "z" not in s.tasks or s.tasks["z"].state != "no-worker": await asyncio.sleep(0.01) assert s.unrunnable == {s.tasks["z"]} # Test that a task that already started when the worker paused can complete # and its output can be retrieved. Also test that the now free slot won't be # used by other tasks. await ev_x.set() assert await x == 1 await asyncio.sleep(0.05) assert a.executing_count == 0 assert len(a.ready) == 1 assert a.tasks["y"].state == "ready" assert "z" not in a.tasks # Release the memory. Tasks that were queued on the worker are executed. # Tasks that were stuck on the scheduler are sent to the worker and executed. mocked_rss = 0 assert await y == 2 assert await z == 3 assert a.status == Status.running assert "Resuming worker" in logger.getvalue()
def test_wait_for_scheduler(): with captured_logger('distributed') as log: w = Nanny('127.0.0.1:44737') w._start() yield gen.sleep(6) log = log.getvalue() assert 'error' not in log.lower(), log assert 'restart' not in log.lower(), log
def test_wait_for_scheduler(): with captured_logger("distributed") as log: w = Nanny("127.0.0.1:44737") w._start() yield gen.sleep(6) log = log.getvalue() assert "error" not in log.lower(), log assert "restart" not in log.lower(), log
def test_wait_for_scheduler(): with captured_logger('distributed') as log: w = Nanny('127.0.0.1:44737') w._start() yield gen.sleep(6) log = log.getvalue() assert 'error' not in log.lower(), log assert 'restart' not in log.lower(), log
def test_safe_sizeof_logs_on_failure(): class BadlySized: def __sizeof__(self): raise ValueError("bar") foo = BadlySized() # Defaults to 0.95 MiB by default with captured_logger(logging.getLogger("distributed.sizeof")) as logs: assert safe_sizeof(foo) == 1e6 assert "Sizeof calculation failed. Defaulting to 0.95 MiB" in logs.getvalue() # Can provide custom `default_size` with captured_logger(logging.getLogger("distributed.sizeof")) as logs: default_size = 2 * (1024 ** 2) # 2 MiB assert safe_sizeof(foo, default_size=default_size) == default_size assert "Defaulting to 2.00 MiB" in logs.getvalue()
def assert_amm_log(expect: list[str]): with captured_logger("distributed.active_memory_manager", level=logging.DEBUG) as logger: yield actual = logger.getvalue().splitlines() if len(actual) != len(expect) or any(not a.startswith(e) for a, e in zip(actual, expect)): raise AssertionError("Log lines mismatch:\n" + "\n".join(actual) + "\n" + "=" * 80 + "\n" + "Does not match:\n" + "\n".join(expect))
def test_safe_sizeof_raises(): class BadlySized: def __sizeof__(self): raise ValueError("bar") foo = BadlySized() with captured_logger(logging.getLogger("distributed.sizeof")) as logs: assert safe_sizeof(foo) == 1e6 assert "Sizeof calculation failed. Defaulting to 1MB" in logs.getvalue()
async def test_no_hang_when_scheduler_closes(s, a, b): # https://github.com/dask/distributed/issues/2880 with captured_logger("tornado.application", logging.ERROR) as logger: await s.close() await asyncio.sleep(1.2) assert a.status == Status.closed assert b.status == Status.closed out = logger.getvalue() assert "Timed out trying to connect" not in out
async def test_wait_for_scheduler(): with captured_logger("distributed") as log: w = Nanny("127.0.0.1:44737") IOLoop.current().add_callback(w.start) await asyncio.sleep(6) await w.close() log = log.getvalue() assert "error" not in log.lower(), log assert "restart" not in log.lower(), log
def test_workspace_process_crash(tmpdir): # WorkSpace should be able to clean up stale contents left by # crashed process base_dir = str(tmpdir) assert_contents = functools.partial(assert_directory_contents, base_dir) ws = WorkSpace(base_dir) code = """if 1: import signal import sys import time from distributed.diskutils import WorkSpace ws = WorkSpace(%(base_dir)r) a = ws.new_work_dir(name='aa') b = ws.new_work_dir(prefix='foo-') print((a.dir_path, b.dir_path)) sys.stdout.flush() time.sleep(100) """ % dict( base_dir=base_dir ) p = subprocess.Popen( [sys.executable, "-c", code], stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True, ) line = p.stdout.readline() assert p.poll() is None a_path, b_path = eval(line) assert_contents([a_path, a_path + ".dirlock", b_path, b_path + ".dirlock"]) # The child process holds a lock so the work dirs shouldn't be removed ws._purge_leftovers() assert_contents([a_path, a_path + ".dirlock", b_path, b_path + ".dirlock"]) # Kill the process so it's unable to clear the work dirs itself p.kill() assert p.wait() # process returned with non-zero code assert_contents([a_path, a_path + ".dirlock", b_path, b_path + ".dirlock"]) with captured_logger("distributed.diskutils", "INFO", propagate=False) as sio: ws._purge_leftovers() assert_contents([]) # One log line per purged directory lines = sio.getvalue().splitlines() assert len(lines) == 2 for p in (a_path, b_path): assert any(repr(p) in line for line in lines)
def test_close_twice(): with LocalCluster() as cluster: with Client(cluster.scheduler_address) as client: f = client.map(inc, range(100)) client.gather(f) with captured_logger('tornado.application') as log: cluster.close() cluster.close() sleep(0.5) log = log.getvalue() assert not log
def test_close_twice(): with LocalCluster() as cluster: with Client(cluster.scheduler_address) as client: f = client.map(inc, range(100)) client.gather(f) with captured_logger("tornado.application") as log: cluster.close() cluster.close() sleep(0.5) log = log.getvalue() assert not log
def test_lose_task(c, s, a, b): with captured_logger('distributed.stealing') as log: s.periodic_callbacks['stealing'].interval = 1 for i in range(100): futures = c.map(slowinc, range(10), delay=0.01, pure=False, workers=a.address, allow_other_workers=True) yield gen.sleep(0.01) del futures out = log.getvalue() assert 'Error' not in out
def test_nanny_timeout(c, s, a): x = yield c.scatter(123) with captured_logger(logging.getLogger('distributed.nanny'), level=logging.ERROR) as logger: response = yield a.restart(timeout=0.1) out = logger.getvalue() assert 'timed out' in out.lower() start = time() while x.status != 'cancelled': yield gen.sleep(0.1) assert time() < start + 7
def test_workspace_rmtree_failure(tmpdir): base_dir = str(tmpdir) ws = WorkSpace(base_dir) a = ws.new_work_dir(name='aa') shutil.rmtree(a.dir_path) with captured_logger('distributed.diskutils', 'ERROR', propagate=False) as sio: a.release() lines = sio.getvalue().splitlines() # shutil.rmtree() may call its onerror callback several times assert lines for line in lines: assert line.startswith("Failed to remove %r" % (a.dir_path,))
def test_nanny_timeout(c, s, a): x = yield c.scatter(123) with captured_logger(logging.getLogger("distributed.nanny"), level=logging.ERROR) as logger: response = yield a.restart(timeout=0.1) out = logger.getvalue() assert "timed out" in out.lower() start = time() while x.status != "cancelled": yield gen.sleep(0.1) assert time() < start + 7
def enable_gc_diagnosis_and_log(diag, level='INFO'): disable_gc_diagnosis(force=True) # just in case assert not gc.callbacks with captured_logger('distributed.utils_perf', level=level, propagate=False) as sio: gc.disable() gc.collect() # drain any leftover from previous tests diag.enable() try: yield sio finally: diag.disable() gc.enable()
def test_tick_logging(s, a, b): pytest.importorskip('crick') from distributed import core old = core.tick_maximum_delay core.tick_maximum_delay = 0.001 try: with captured_logger('distributed.core') as sio: yield gen.sleep(0.1) text = sio.getvalue() assert "unresponsive" in text assert 'Scheduler' in text or 'Worker' in text finally: core.tick_maximum_delay = old
def test_workspace_process_crash(tmpdir): # WorkSpace should be able to clean up stale contents left by # crashed process base_dir = str(tmpdir) assert_contents = functools.partial(assert_directory_contents, base_dir) ws = WorkSpace(base_dir) code = """if 1: import signal import sys import time from distributed.diskutils import WorkSpace ws = WorkSpace(%(base_dir)r) a = ws.new_work_dir(name='aa') b = ws.new_work_dir(prefix='foo-') print((a.dir_path, b.dir_path)) sys.stdout.flush() time.sleep(100) """ % dict(base_dir=base_dir) p = subprocess.Popen([sys.executable, '-c', code], stdin=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True) line = p.stdout.readline() assert p.poll() is None a_path, b_path = eval(line) assert_contents([a_path, a_path + '.dirlock', b_path, b_path + '.dirlock']) # The child process holds a lock so the work dirs shouldn't be removed ws._purge_leftovers() assert_contents([a_path, a_path + '.dirlock', b_path, b_path + '.dirlock']) # Kill the process so it's unable to clear the work dirs itself p.kill() assert p.wait() # process returned with non-zero code assert_contents([a_path, a_path + '.dirlock', b_path, b_path + '.dirlock']) with captured_logger('distributed.diskutils', 'INFO', propagate=False) as sio: ws._purge_leftovers() assert_contents([]) # One log line per purged directory lines = sio.getvalue().splitlines() assert len(lines) == 2 for p in (a_path, b_path): assert any(repr(p) in line for line in lines)
def test_all_exceptions_logging(): @gen.coroutine def throws(): raise Exception('foo1234') with captured_logger('') as sio: try: yield All([throws() for _ in range(5)], quiet_exceptions=Exception) except Exception: pass import gc; gc.collect() yield gen.sleep(0.1) assert 'foo1234' not in sio.getvalue()
def test_logging(): """ Test default logging configuration. """ d = logging.getLogger('distributed') assert len(d.handlers) == 1 assert isinstance(d.handlers[0], logging.StreamHandler) root = logging.getLogger('') dfb = logging.getLogger('distributed.foo.bar') f = logging.getLogger('foo') fb = logging.getLogger('foo.bar') with captured_handler(d.handlers[0]) as distributed_log: with captured_logger(root) as foreign_log: h = logging.StreamHandler(foreign_log) fmt = '[%(levelname)s in %(name)s] - %(message)s' h.setFormatter(logging.Formatter(fmt)) fb.addHandler(h) fb.propagate = False d.debug("1: debug") d.info("2: info") dfb.info("3: info") fb.info("4: info") fb.error("5: error") f.info("6: info") f.error("7: error") distributed_log = distributed_log.getvalue().splitlines() foreign_log = foreign_log.getvalue().splitlines() # distributed log is configured at INFO level by default assert distributed_log == [ "distributed - INFO - 2: info", "distributed.foo.bar - INFO - 3: info", ] # foreign logs should be unaffected by distributed's logging # configuration. They get the default ERROR level from logging. assert foreign_log == [ "[ERROR in foo.bar] - 5: error", "7: error", ]
def test_nanny_terminate(c, s, a): from time import sleep def leak(): L = [] while True: L.append(b'0' * 5000000) sleep(0.01) proc = a.process.pid with captured_logger(logging.getLogger('distributed.nanny')) as logger: future = c.submit(leak) start = time() while a.process.pid == proc: yield gen.sleep(0.1) assert time() < start + 10 out = logger.getvalue() assert 'restart' in out.lower() assert 'memory' in out.lower()
def _workspace_concurrency(base_dir, purged_q, err_q, stop_evt): ws = WorkSpace(base_dir) n_purged = 0 with captured_logger('distributed.diskutils', 'ERROR') as sio: while not stop_evt.is_set(): # Add a bunch of locks, and simulate forgetting them try: purged = ws._purge_leftovers() except Exception as e: err_q.put(e) else: n_purged += len(purged) lines = sio.getvalue().splitlines() if lines: try: raise AssertionError("got %d logs, see stderr" % (len(lines,))) except Exception as e: err_q.put(e) purged_q.put(n_purged)
def test_pause_executor(c, s, a): memory = psutil.Process().memory_info().rss a.memory_limit = memory / 0.8 + 200e6 np = pytest.importorskip('numpy') def f(): x = np.ones(int(300e6), dtype='u1') sleep(1) with captured_logger(logging.getLogger('distributed.worker')) as logger: future = c.submit(f) futures = c.map(slowinc, range(10), delay=0.1) yield gen.sleep(0.3) assert a.paused, (format_bytes(psutil.Process().memory_info().rss), format_bytes(a.memory_limit)) out = logger.getvalue() assert 'memory' in out.lower() assert 'pausing' in out.lower() assert sum(f.status == 'finished' for f in futures) < 4 yield wait(futures)