def test_worker_task_refs(ray_start_regular):
    """Check the reference table seen from inside a task and after it ends."""

    @ray.remote
    def f(y):
        # Keep a put object alive while the summary is captured in the worker.
        put_ref = ray.put("HI")
        worker_view = memory_summary()
        del put_ref
        return worker_view

    result_ref = f.remote(np.zeros(100000))
    info = ray.get(result_ref)
    print(info)
    assert num_objects(info) == 4, info

    # Expected occurrences in the summary captured while the task was running.
    in_task_expectations = [
        # Task argument plus task return ids.
        (TASK_CALL_OBJ, 2),
        (DRIVER_PID, 1),
        (WORKER_PID, 1),
        (LOCAL_REF, 2),
        (PINNED_IN_MEMORY, 1),
        (PUT_OBJ, 1),
        (DESER_TASK_ARG, 1),
        (UNKNOWN_SIZE, 1),
        ("test_memstat.py:f", 1),
        ("test_memstat.py:test_worker_task_refs", 2),
    ]
    for needle, expected in in_task_expectations:
        assert count(info, needle) == expected, info

    # Summary taken from the driver once the task result has been fetched.
    info = memory_summary()
    print(info)
    assert num_objects(info) == 1, info
    after_task_expectations = [
        (DRIVER_PID, 1),
        (TASK_CALL_OBJ, 1),
        (UNKNOWN_SIZE, 0),
        (result_ref.hex(), 1),
    ]
    for needle, expected in after_task_expectations:
        assert count(info, needle) == expected, info

    # Dropping the last reference should leave nothing in the summary.
    del result_ref
    info = memory_summary()
    assert num_objects(info) == 0, info
def test_pinned_object_call_site(ray_start_regular):
    """Follow one put object through local-ref, pinned and released states."""
    address = ray_start_regular["address"]

    def snapshot():
        # Fetch and echo the current summary for this cluster address.
        text = memory_summary(address)
        print(text)
        return text

    # Local ref only.
    ref = ray.put(np.zeros(100000))
    summary = snapshot()
    assert num_objects(summary) == 1, summary
    assert count(summary, LOCAL_REF) == 1, summary
    assert count(summary, PINNED_IN_MEMORY) == 0, summary

    # Local ref + pinned buffer.
    pinned = ray.get(ref)
    summary = snapshot()
    assert num_objects(summary) == 1, summary
    assert count(summary, LOCAL_REF) == 0, summary
    assert count(summary, PINNED_IN_MEMORY) == 1, summary

    # Just pinned buffer.
    del ref
    summary = snapshot()
    assert num_objects(summary) == 1, summary
    assert count(summary, LOCAL_REF) == 0, summary
    assert count(summary, PINNED_IN_MEMORY) == 1, summary

    # Nothing.
    del pinned
    summary = snapshot()
    assert num_objects(summary) == 0, summary
def f(y):
    """Return three summaries, each with a different grouping and sort key.

    A put object is held for the duration of the three calls and released
    before returning.
    """
    held = ray.put("HI")
    views = tuple(
        memory_summary(group_by=grouping, sort_by=ordering)
        for grouping, ordering in (
            ("STACK_TRACE", "REFERENCE_TYPE"),
            ("NODE_ADDRESS", "OBJECT_SIZE"),
            ("NODE_ADDRESS", "PID"),
        )
    )
    del held
    return views
def test_memory_release_eager(shutdown_only):
    """Run two eager map rounds and check the first one does not spill."""
    info = ray.init(num_cpus=1, object_store_memory=1500e6)
    # Read the cluster address one way only; the original mixed
    # ``info.address_info["address"]`` with ``info["address"]``.
    address = info.address_info["address"]
    ds = ray.data.range(10)

    # Round 1.
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    meminfo = memory_summary(address, stats_only=True)
    assert "Spilled" not in meminfo, meminfo

    # Round 2.
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    # NOTE(review): the round-2 summary is fetched but never asserted on in
    # the visible code -- confirm whether a "Spilled" check is intentionally
    # omitted here.
    meminfo = memory_summary(address, stats_only=True)
def test_driver_put_ref(ray_start_regular):
    """Check a driver-side put appears in, then leaves, the summary."""
    # Nothing is referenced before the put.
    summary = memory_summary()
    assert num_objects(summary) == 0, summary

    ref = ray.put("HI")
    summary = memory_summary()
    print(summary)
    assert num_objects(summary) == 1, summary
    for pid_marker, expected in ((DRIVER_PID, 1), (WORKER_PID, 0)):
        assert count(summary, pid_marker) == expected, summary

    # Releasing the only reference empties the summary again.
    del ref
    summary = memory_summary()
    assert num_objects(summary) == 0, summary
def test_multi_node_stats(shutdown_only):
    """Check the summary reports put objects held by actors on two nodes."""
    node_count = 2
    cluster = Cluster()
    for _ in range(node_count):
        cluster.add_node(num_cpus=1)

    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            # Each actor keeps one put object alive for its lifetime.
            self.ref = ray.put(np.zeros(100000))

        def ping(self):
            pass

    # Each actor will be on a different node.
    actors = [Actor.remote() for _ in range(2)]
    for handle in actors:
        ray.get(handle.ping.remote())

    # Verify we have collected stats across the nodes.
    summary = memory_summary()
    print(summary)
    assert count(summary, PUT_OBJ) == 2, summary
def test_spill_stats(object_spilling_config, shutdown_only):
    """Check the plasma, spilled and restored figures in the memory summary."""
    # Limit our object store to 100 MiB of memory.
    store_bytes = 100 * 1024 * 1024
    object_spilling_config, _ = object_spilling_config
    system_config = {
        "automatic_object_spilling_enabled": True,
        "max_io_workers": 100,
        "min_spilling_size": 1,
        "object_spilling_config": object_spilling_config
    }
    ray.init(
        num_cpus=1,
        object_store_memory=store_bytes,
        _system_config=system_config,
    )

    @ray.remote
    def f():
        return np.zeros(50 * 1024 * 1024, dtype=np.uint8)

    # Submit four 50 MiB results, then fetch them back in reverse order.
    ids = []
    for _ in range(4):
        x = f.remote()
        ids.append(x)

    while ids:
        print(ray.get(ids.pop()))

    x_id = f.remote()  # noqa
    ray.get(x_id)

    summary = memory_summary()
    for expected_line in (
            "Plasma memory usage 50 MiB, 1 objects, 50.0% full",
            "Spilled 200 MiB, 4 objects",
            "Restored 150 MiB, 3 objects",
    ):
        assert expected_line in summary, summary
def ok():
    """Return True when the stats-only summary matches the expected figures.

    ``restored``, ``spilled`` and ``fallback`` come from the enclosing scope.
    A falsy value means the corresponding line must be absent from the
    summary; ``spilled`` may be a single value or a list of acceptable values.
    """
    s = memory_summary(address=address["address"], stats_only=True)
    print(s)

    if restored:
        if "Restored {} MiB".format(restored) not in s:
            return False
    elif "Restored" in s:
        return False

    if spilled:
        # Accept any one of the candidate spilled sizes.
        candidates = spilled if isinstance(spilled, list) else [spilled]
        if not any("Spilled {} MiB".format(n) in s for n in candidates):
            return False
    elif "Spilled" in s:
        return False

    if fallback:
        if "Plasma filesystem mmap usage: {} MiB".format(
                fallback) not in s:
            return False
    elif "Plasma filesystem mmap usage:" in s:
        return False

    return True
def f(y):
    """Return the memory summary taken while this task holds a put object."""
    from ray.internal.internal_api import memory_summary

    held = ray.put("HI")
    snapshot = memory_summary(address)
    # Release the put object before handing the summary back.
    del held
    return snapshot
def test_multi_node_stats(shutdown_only):
    """Check the summary reports put objects held by actors on two nodes."""
    # NOTE(mwtian): using env var only enables the feature on workers, while
    # using head_node_args={"_system_config": ray_config} only enables the
    # feature on the driver.
    os.environ["RAY_record_ref_creation_sites"] = "1"

    cluster = Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=1)
    cluster_address = cluster.address

    ray.init(address=cluster_address)

    @ray.remote(num_cpus=1)
    class Actor:
        def __init__(self):
            # Each actor keeps one put object alive for its lifetime.
            self.ref = ray.put(np.zeros(100000))

        def ping(self):
            pass

    # Each actor will be on a different node.
    first, second = Actor.remote(), Actor.remote()
    ray.get(first.ping.remote())
    ray.get(second.ping.remote())

    # Verify we have collected stats across the nodes.
    summary = memory_summary(cluster.address)
    print(summary)
    assert count(summary, PUT_OBJ) == 2, summary
def test_actor_task_refs(ray_start_regular):
    """Check references reported for an actor task and for refs it retains."""
    address = ray_start_regular["address"]

    @ray.remote
    class Actor:
        def __init__(self):
            self.refs = []

        def f(self, x):
            from ray.internal.internal_api import memory_summary

            # Retain the argument so it stays referenced by the actor.
            self.refs.append(x)
            return memory_summary(address)

    def make_actor():
        return Actor.remote()

    actor = make_actor()
    result_ref = actor.f.remote(np.zeros(100000))
    info = ray.get(result_ref)
    print(info)
    # Note, the actor will always hold a handle to the actor itself.
    assert num_objects(info) == 5, info

    in_call_expectations = [
        # Actor handle, task argument id, task return id.
        (ACTOR_TASK_CALL_OBJ, 3),
        (DRIVER_PID, 3),
        (WORKER_PID, 2),
        (LOCAL_REF, 1),
        (PINNED_IN_MEMORY, 1),
        (USED_BY_PENDING_TASK, 1),
        (ACTOR_HANDLE, 2),
        (DESER_ACTOR_TASK_ARG, 1),
    ]
    for needle, expected in in_call_expectations:
        assert count(info, needle) == expected, info
    del result_ref

    # These should accumulate in the actor.
    for _ in range(5):
        ray.get(actor.f.remote([ray.put(np.zeros(100000))]))
    info = memory_summary(address)
    print(info)
    for needle, expected in ((DESER_ACTOR_TASK_ARG, 5),
                             (ACTOR_TASK_CALL_OBJ, 1)):
        assert count(info, needle) == expected, info

    # Cleanup.
    del actor
    time.sleep(1)
    info = memory_summary(address)
    assert num_objects(info) == 0, info
def test_memory_used_output(ray_start_regular):
    """Check the plasma usage line and object size shown for an 8 MiB put."""
    import numpy as np

    payload = np.ones(8 * 1024 * 1024, dtype=np.int8)
    # Bind the ref so the object stays alive while the summary is taken.
    _ = ray.put(payload)

    summary = memory_summary()
    print(summary)
    assert count(summary, "Plasma memory usage 8 MiB") == 1, summary
    assert count(summary, "8388861 B") == 2, summary
def filtered_summary():
    """Return the unwrapped memory summary with ACTOR_HANDLE lines removed."""
    rows = memory_summary(address, line_wrap=False).split("\n")
    kept = filter(lambda row: "ACTOR_HANDLE" not in row, rows)
    return "\n".join(kept)
def check_no_spill(ctx, pipe, prefetch_blocks: int = 0):
    """Consume ``pipe`` for about ten seconds and assert nothing was spilled.

    Args:
        ctx: Object whose ``address_info["address"]`` is passed to
            ``memory_summary``.
        pipe: Object exposing ``iter_batches(prefetch_blocks=...)``.
        prefetch_blocks: Forwarded to ``iter_batches``.
    """
    # Run .iter_batches() for 10 secs, and we expect no object spilling.
    run_seconds = 10
    deadline = time.time() + run_seconds
    for batch in pipe.iter_batches(prefetch_blocks=prefetch_blocks):
        # Stop consuming once the time budget is used up.
        if time.time() > deadline:
            break

    stats = memory_summary(ctx.address_info["address"], stats_only=True)
    assert "Spilled" not in stats, stats
def test_spill_stats(object_spilling_config, shutdown_only):
    """Check spill, restore and consumed-bytes figures in the stats summary."""
    # Limit our object store to 100 MiB of memory.
    store_bytes = 100 * 1024 * 1024
    object_spilling_config, _ = object_spilling_config
    system_config = {
        "automatic_object_spilling_enabled": True,
        "max_io_workers": 100,
        "min_spilling_size": 1,
        "object_spilling_config": object_spilling_config,
    }
    address = ray.init(
        num_cpus=1,
        object_store_memory=store_bytes,
        _system_config=system_config,
    )

    @ray.remote
    def f():
        return np.zeros(50 * 1024 * 1024, dtype=np.uint8)

    # Submit four 50 MiB results, then fetch them back in reverse order.
    ids = []
    for _ in range(4):
        x = f.remote()
        ids.append(x)

    while ids:
        print(ray.get(ids.pop()))

    x_id = f.remote()  # noqa
    ray.get(x_id)

    summary = memory_summary(address=address["address"], stats_only=True)
    for expected_line in (
            "Plasma memory usage 50 MiB, 1 objects, 50.0% full",
            "Spilled 200 MiB, 4 objects",
            "Restored 150 MiB, 3 objects",
    ):
        assert expected_line in summary, summary

    # Test if consumed bytes are correctly calculated.
    obj = ray.put(np.zeros(30 * 1024 * 1024, dtype=np.uint8))

    @ray.remote
    def func_with_ref(obj):
        return True

    ray.get(func_with_ref.remote(obj))

    summary = memory_summary(address=address["address"], stats_only=True)
    # 50MB * 5 references + 30MB used for task execution.
    assert "Objects consumed by Ray tasks: 280 MiB." in summary, summary
    assert_no_thrashing(address["address"])
def test_memory_sanity(shutdown_only):
    """Check that a map over 100 MiB blocks spills with a 500 MB store."""
    ctx = ray.init(num_cpus=1, object_store_memory=500e6)
    ds = ray.data.range(10)
    ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))

    stats = memory_summary(ctx.address_info["address"], stats_only=True)
    # Sanity check spilling is happening as expected.
    assert "Spilled" in stats, stats
def test_memory_release_pipeline(shutdown_only, lazy_input):
    """Run a multi-stage single-window pipeline and check nothing spills.

    Args:
        shutdown_only: Fixture; not used directly in the body.
        lazy_input: When truthy the input comes from ``read_datasource``,
            otherwise from ``from_items``.
    """
    context = DatasetContext.get_current()
    # Disable stage fusion so we can keep reads and maps from being fused together,
    # since we're trying to test multi-stage memory releasing here.
    context.optimize_fuse_stages = False

    # This object store allocation can hold at most 1 copy of the transformed dataset.
    # The lazy and eager input paths used the same value, so no branch is needed.
    object_store_memory = 3000e6

    n = 10
    info = ray.init(num_cpus=n, object_store_memory=object_store_memory)
    if lazy_input:
        ds = ray.data.read_datasource(
            OnesSource(),
            parallelism=n,
            n_per_block=100 * 1024 * 1024,
        )
    else:
        ds = ray.data.from_items(list(range(n)), parallelism=n)

    # Create a single-window pipeline.
    pipe = ds.window(blocks_per_window=n)

    # Round 1.
    def gen(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure handling.
        time.sleep(2)
        if isinstance(x, np.ndarray):
            return x
        else:
            return np.ones(100 * 1024 * 1024, dtype=np.uint8)

    pipe = pipe.map(gen)

    def inc(x):
        import time

        # TODO(Clark): Remove this sleep once we have fixed memory pressure handling.
        time.sleep(2)
        return x + 1

    num_rounds = 10
    for _ in range(num_rounds):
        pipe = pipe.map(inc)

    # Every element should have been incremented once per round.
    for block in pipe.iter_batches(batch_size=None):
        for arr in block:
            np.testing.assert_equal(
                arr,
                np.ones(100 * 1024 * 1024, dtype=np.uint8) + num_rounds,
            )
    # Read the address through ``address_info`` like the other dataset
    # memory tests, rather than indexing the init result directly.
    meminfo = memory_summary(info.address_info["address"], stats_only=True)
    assert "Spilled" not in meminfo, meminfo
def test_actor_task_refs(ray_start_regular):
    """Check references reported for an actor task and for refs it retains."""

    @ray.remote
    class Actor:
        def __init__(self):
            self.refs = []

        def f(self, x):
            # Retain the argument so it stays referenced by the actor.
            self.refs.append(x)
            return memory_summary()

    def make_actor():
        return Actor.remote()

    actor = make_actor()
    result_ref = actor.f.remote(np.zeros(100000))
    info = ray.get(result_ref)
    print(info)
    assert num_objects(info) == 4, info

    in_call_expectations = [
        # Actor handle, task argument id, task return id.
        (ACTOR_TASK_CALL_OBJ, 3),
        (DRIVER_PID, 1),
        (WORKER_PID, 1),
        (LOCAL_REF, 1),
        (PINNED_IN_MEMORY, 1),
        (USED_BY_PENDING_TASK, 2),
        (DESER_ACTOR_TASK_ARG, 1),
        ("test_memstat.py:test_actor_task_refs", 3),
        ("test_memstat.py:make_actor", 1),
    ]
    for needle, expected in in_call_expectations:
        assert count(info, needle) == expected, info
    del result_ref

    # These should accumulate in the actor.
    for _ in range(5):
        ray.get(actor.f.remote([ray.put(np.zeros(100000))]))
    info = memory_summary()
    print(info)
    for needle, expected in ((DESER_ACTOR_TASK_ARG, 5),
                             (ACTOR_TASK_CALL_OBJ, 1)):
        assert count(info, needle) == expected, info

    # Cleanup.
    del actor
    time.sleep(1)
    info = memory_summary()
    assert num_objects(info) == 0, info
def test_nested_object_refs(ray_start_regular):
    """Check refs nested inside put objects are reported as captured."""
    # Build a three-level chain: outer -> [middle] -> [inner].
    inner = ray.put(np.zeros(100000))
    middle = ray.put([inner])
    outer = ray.put([middle])
    # Only the outermost ref stays directly held by the driver.
    del inner, middle

    summary = memory_summary()
    print(summary)
    assert num_objects(summary) == 3, summary
    for ref_type, expected in ((LOCAL_REF, 1), (CAPTURED_IN_OBJECT, 2)):
        assert count(summary, ref_type) == expected, summary
    del outer
def test_memory_release_lazy(shutdown_only):
    """Check three lazy map stages execute without spilling."""
    ctx = ray.init(num_cpus=1, object_store_memory=1500e6)
    ds = ray.data.range(10)

    # Should get fused into single stage.
    ds = ds._experimental_lazy()
    for _ in range(3):
        ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds.fully_executed()

    stats = memory_summary(ctx.address_info["address"], stats_only=True)
    assert "Spilled" not in stats, stats
def test_worker_task_refs(ray_start_regular):
    """Check the reference table seen from inside a task and after it ends."""
    address = ray_start_regular["address"]

    @ray.remote
    def f(y):
        from ray.internal.internal_api import memory_summary

        # Keep a put object alive while the summary is captured in the worker.
        put_ref = ray.put("HI")
        worker_view = memory_summary(address)
        del put_ref
        return worker_view

    result_ref = f.remote(np.zeros(100000))
    info = ray.get(result_ref)
    print(info)
    assert num_objects(info) == 4, info

    # Expected occurrences in the summary captured while the task was running.
    in_task_expectations = [
        # Task argument plus task return ids.
        (TASK_CALL_OBJ, 2),
        (DRIVER_PID, 2),
        (WORKER_PID, 2),
        (LOCAL_REF, 2),
        (PINNED_IN_MEMORY, 1),
        (PUT_OBJ, 1),
        (DESER_TASK_ARG, 1),
        (UNKNOWN_SIZE, 1),
    ]
    for needle, expected in in_task_expectations:
        assert count(info, needle) == expected, info

    print(ray_start_regular)
    # Summary taken from the driver once the task result has been fetched.
    info = memory_summary(address)
    print(info)
    assert num_objects(info) == 1, info
    after_task_expectations = [
        (DRIVER_PID, 1),
        (TASK_CALL_OBJ, 1),
        (UNKNOWN_SIZE, 0),
        (result_ref.hex(), 1),
    ]
    for needle, expected in after_task_expectations:
        assert count(info, needle) == expected, info

    # Dropping the last reference should leave nothing in the summary.
    del result_ref
    info = memory_summary(address)
    assert num_objects(info) == 0, info
def assert_no_thrashing(address):
    """Assert the consumed figure in the summary is at least the restored one.

    Both figures are read from the stats-only memory summary for ``address``
    and default to 0 when no matching line is present.
    """
    global_state = ray.state.GlobalState()
    global_state._initialize_global_state(
        GcsClientOptions.from_gcs_address(address))

    report = memory_summary(address=address, stats_only=True)
    restored_bytes = 0
    consumed_bytes = 0
    for row in report.split("\n"):
        fields = row.split(" ")
        if "Restored" in row:
            # The figure is the second space-separated field.
            restored_bytes = int(fields[1])
        if "consumed" in row:
            # The figure is the second-to-last space-separated field.
            consumed_bytes = int(fields[-2])
    assert (consumed_bytes >= restored_bytes
            ), f"consumed: {consumed_bytes}, restored: {restored_bytes}"
def test_memory_release_lazy(shutdown_only):
    """Check three lazy map stages execute without spilling when fused."""
    # Ensure that stage fusion is enabled.
    DatasetContext.get_current().optimize_fuse_stages = True

    ctx = ray.init(num_cpus=1, object_store_memory=1500e6)
    ds = ray.data.range(10)

    # Should get fused into single stage.
    ds = ds.experimental_lazy()
    for _ in range(3):
        ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
    ds.fully_executed()

    stats = memory_summary(ctx.address_info["address"], stats_only=True)
    assert "Spilled" not in stats, stats
def assert_no_thrashing(address):
    """Assert the consumed figure in the summary is at least the restored one.

    Both figures are read from the stats-only memory summary for ``address``
    and default to 0 when no matching line is present.
    """
    global_state = ray.state.GlobalState()
    global_state._initialize_global_state(
        address, ray.ray_constants.REDIS_DEFAULT_PASSWORD)

    report = memory_summary(address=address, stats_only=True)
    restored_bytes = 0
    consumed_bytes = 0
    for row in report.split("\n"):
        fields = row.split(" ")
        if "Restored" in row:
            # The figure is the second space-separated field.
            restored_bytes = int(fields[1])
        if "consumed" in row:
            # The figure is the second-to-last space-separated field.
            consumed_bytes = int(fields[-2])
    assert consumed_bytes >= restored_bytes, (
        f"consumed: {consumed_bytes}, restored: {restored_bytes}")
def ok():
    """Return True when the stats-only summary matches the expected figures.

    ``restored`` and ``spilled`` come from the enclosing scope; a falsy value
    means the corresponding line must be absent from the summary.
    """
    s = memory_summary(address=address["redis_address"], stats_only=True)
    print(s)

    if restored:
        if "Restored {} MiB".format(restored) not in s:
            return False
    elif "Restored" in s:
        return False

    if spilled:
        if "Spilled {} MiB".format(spilled) not in s:
            return False
    elif "Spilled" in s:
        return False

    return True
def test_spill_stats(tmp_path, shutdown_only):
    """Check the plasma, spilled and restored figures in the memory summary."""
    # Limit our object store to 100 MiB of memory.
    store_bytes = 100 * 1024 * 1024
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()
    # Spill to a directory under pytest's temporary path.
    spilling_config = json.dumps(
        {
            "type": "filesystem",
            "params": {
                "directory_path": str(temp_folder)
            }
        },
        separators=(",", ":"))
    ray.init(
        num_cpus=1,
        object_store_memory=store_bytes,
        _system_config={
            "automatic_object_spilling_enabled": True,
            "max_io_workers": 100,
            "min_spilling_size": 1,
            "object_spilling_config": spilling_config
        },
    )

    @ray.remote
    def f():
        return np.zeros(50 * 1024 * 1024, dtype=np.uint8)

    # Submit four 50 MiB results, then fetch them back in reverse order.
    ids = []
    for _ in range(4):
        x = f.remote()
        ids.append(x)

    while ids:
        print(ray.get(ids.pop()))

    x_id = f.remote()  # noqa
    ray.get(x_id)

    summary = memory_summary()
    for expected_line in (
            "Plasma memory usage 50 MiB, 1 objects, 50.0% full",
            "Spilled 200 MiB, 4 objects",
            "Restored 150 MiB, 3 objects",
    ):
        assert expected_line in summary, summary
def stats():
    """Count "Attempt #2" summary lines by task status.

    Returns:
        A 3-tuple of line counts for the WAITING_FOR_DEPENDENCIES, SCHEDULED
        and FINISHED markers, in that order.
    """
    report = memory_summary(cluster.address, line_wrap=False)
    # Narrow to second-attempt lines once, then tally each status marker.
    second_attempts = [
        row for row in report.split("\n") if "Attempt #2" in row
    ]

    def tally(status):
        return sum(1 for row in second_attempts if status in row)

    return (
        tally(WAITING_FOR_DEPENDENCIES),
        tally(SCHEDULED),
        tally(FINISHED),
    )
def assert_no_thrashing(address):
    """Assert the consumed figure in the summary is at least the restored one.

    Both figures are read from the stats-only memory summary for ``address``
    and default to 0 when no matching line is present.
    """
    global_state = ray.state.GlobalState()
    # Build client options for whichever bootstrap mode is active.
    options = (GcsClientOptions.from_gcs_address(address)
               if use_gcs_for_bootstrap() else
               GcsClientOptions.from_redis_address(
                   address, ray.ray_constants.REDIS_DEFAULT_PASSWORD))
    global_state._initialize_global_state(options)

    report = memory_summary(address=address, stats_only=True)
    restored_bytes = 0
    consumed_bytes = 0
    for row in report.split("\n"):
        fields = row.split(" ")
        if "Restored" in row:
            # The figure is the second space-separated field.
            restored_bytes = int(fields[1])
        if "consumed" in row:
            # The figure is the second-to-last space-separated field.
            consumed_bytes = int(fields[-2])
    assert (consumed_bytes >= restored_bytes
            ), f"consumed: {consumed_bytes}, restored: {restored_bytes}"
def test_memory_release_lazy_shuffle(shutdown_only):
    """Check a lazy map plus random shuffle runs without spilling.

    The scenario is attempted up to three times; the test passes on the first
    successful attempt and otherwise re-raises the last failure.
    """
    # TODO(ekl) why is this flaky? Due to eviction delay?
    max_trials = 3
    error = None
    for trial in range(max_trials):
        print("Try", trial)
        try:
            ctx = ray.init(num_cpus=1, object_store_memory=1800e6)
            ds = ray.data.range(10)

            # Should get fused into single stage.
            ds = ds._experimental_lazy()
            ds = ds.map(lambda x: np.ones(100 * 1024 * 1024, dtype=np.uint8))
            ds.random_shuffle().fully_executed()
            stats = memory_summary(
                ctx.address_info["address"], stats_only=True)
            assert "Spilled" not in stats, stats
            return
        except Exception as e:
            # Remember the failure and move on to the next trial.
            error = e
            print("Failed", e)
        finally:
            # Runs after every trial, including the successful one.
            ray.shutdown()
    raise error
def ok():
    """Return True when the stats-only summary matches the expected figures.

    ``restored``, ``spilled`` and ``fallback`` come from the enclosing scope;
    a falsy value means the corresponding line must be absent from the
    summary.
    """
    s = memory_summary(address=address["redis_address"], stats_only=True)
    print(s)

    if restored:
        if "Restored {} MiB".format(restored) not in s:
            return False
    elif "Restored" in s:
        return False

    if spilled:
        if "Spilled {} MiB".format(spilled) not in s:
            return False
    elif "Spilled" in s:
        return False

    if fallback:
        if "Plasma filesystem mmap usage: {} MiB".format(
                fallback) not in s:
            return False
    elif "Plasma filesystem mmap usage:" in s:
        return False

    return True