def test_failure_with_storage(workflow_start_regular):
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(temp_dir)
        _alter_storage(debug_store)

        wf = construct_workflow(length=3)
        result = wf.run(workflow_id="complex_workflow")
        index = _locate_initial_commit(debug_store) + 1
        debug_store.log_off()

        def resume(num_records_replayed):
            key = debug_store.wrapped_storage.make_key("complex_workflow")
            asyncio_run(debug_store.wrapped_storage.delete_prefix(key))

            async def replay():
                # Replay the records one by one to avoid conflicts.
                for i in range(num_records_replayed):
                    await debug_store.replay(i)

            asyncio_run(replay())
            return ray.get(workflow.resume(workflow_id="complex_workflow"))

        with pytest.raises(ValueError):
            # In this case, the replayed records are too few to resume
            # the workflow.
            resume(index - 1)

        if isinstance(debug_store.wrapped_storage, FilesystemStorageImpl):
            # The filesystem storage is faster, so we can cover all cases.
            step_len = 1
        else:
            step_len = max((len(debug_store) - index) // 5, 1)
        for j in range(index, len(debug_store), step_len):
            assert resume(j) == result
def test_nested_workflow_no_download(workflow_start_regular):
    """Test that we _only_ load from storage on recovery.

    For a nested workflow step, we should checkpoint the input/output,
    but continue to reuse the in-memory value.
    """

    @ray.remote
    def recursive(ref, count):
        if count == 0:
            return ref
        return workflow.continuation(recursive.bind(ref, count - 1))

    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(temp_dir)
        utils._alter_storage(debug_store)

        ref = ray.put("hello")
        result = workflow.create(recursive.bind([ref], 10)).run()

        ops = debug_store._logged_storage.get_op_counter()
        get_objects_count = 0
        for key in ops["get"]:
            if "objects" in key:
                get_objects_count += 1
        assert get_objects_count == 1, \
            "We should only get the object once, when resuming."
        put_objects_count = 0
        for key in ops["put"]:
            if "objects" in key:
                print(key)
                put_objects_count += 1
        assert (
            put_objects_count == 1
        ), "We should detect that the object already exists before uploading it."
        assert ray.get(result) == ["hello"]
def create_storage(storage_url: str) -> Storage:
    """A factory function that creates different types of storage
    according to the URL.

    Args:
        storage_url: A URL that indicates the storage type and root path.
            Currently two types of storage are supported: local filesystem
            and S3.

            For the local filesystem, a path is needed. It can be either a
            URI with the file:// scheme or just a local path, i.e.:

                file:///local_path
                local_path

            For S3, a bucket and a path are required. Other parameters,
            such as credentials or regions, can be passed as well, i.e.:

                s3://bucket/path?region_name=str&endpoint_url=str&aws_access_key_id=str&
                aws_secret_access_key=str&aws_session_token=str

            All parameters are optional and have the same meaning as in
            boto3.client.

    Returns:
        A storage instance.
    """
    parsed_url = parse.urlparse(storage_url)
    if parsed_url.scheme == "file" or parsed_url.scheme == "":
        from ray.workflow.storage.filesystem import FilesystemStorageImpl

        return FilesystemStorageImpl(parsed_url.path)
    elif parsed_url.scheme == "s3":
        from ray.workflow.storage.s3 import S3StorageImpl

        bucket = parsed_url.netloc
        s3_path = parsed_url.path.lstrip("/")
        if not s3_path:
            raise ValueError(f"Invalid s3 path: {s3_path}")
        params = dict(parse.parse_qsl(parsed_url.query))
        return S3StorageImpl(bucket, s3_path, **params)
    elif parsed_url.scheme == "debug":
        from ray.workflow.storage.debug import DebugStorage

        params = dict(parse.parse_qsl(parsed_url.query))
        return DebugStorage(create_storage(params["storage"]), path=parsed_url.path)
    else:
        extra_msg = ""
        if os.name == "nt":
            extra_msg = (
                " Try using file://{} or file:///{} for Windows file paths.".format(
                    storage_url, storage_url))
        raise ValueError(f"Invalid url: {storage_url}." + extra_msg)
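# A minimal usage sketch for `create_storage` (illustration only, not part of
# the module API). The paths and the bucket name below are placeholders, and
# the S3 example needs valid credentials to actually connect.
def _create_storage_usage_example():
    # Local filesystem storage: a plain path and a file:// URI are equivalent.
    local_storage = create_storage("/tmp/workflow_data")

    # S3 storage: query parameters are forwarded to S3StorageImpl with the
    # same meaning as the corresponding boto3.client arguments.
    s3_storage = create_storage(
        "s3://example-bucket/workflow_data?region_name=us-west-2")

    # Debug storage: wraps another storage (given by the `storage` query
    # parameter) and keeps its log under the URL path.
    debug_storage = create_storage(
        "debug:///tmp/debug_log?storage=file:///tmp/workflow_data")
    return local_storage, s3_storage, debug_storage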
def test_dedupe_download_raw_ref(workflow_start_regular):
    """Test that downloads of the same raw ObjectRef are deduplicated."""
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(temp_dir)
        utils._alter_storage(debug_store)

        ref = ray.put("hello")
        workflows = [identity.bind(ref) for _ in range(100)]
        workflow.create(gather.bind(*workflows)).run()

        ops = debug_store._logged_storage.get_op_counter()
        get_objects_count = 0
        for key in ops["get"]:
            if "objects" in key:
                get_objects_count += 1
        # The same object is passed to 100 workflows, but it should only be
        # downloaded from storage once.
        assert get_objects_count == 1
def test_dedupe_downloads_list(workflow_start_regular):
    """Test that downloads of a shared list of ObjectRefs are deduplicated."""
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_store = DebugStorage(get_global_storage(), temp_dir)
        utils._alter_storage(debug_store)

        numbers = [ray.put(i) for i in range(5)]
        workflows = [identity.bind(numbers) for _ in range(100)]
        workflow.create(gather.bind(*workflows)).run()

        ops = debug_store._logged_storage.get_op_counter()
        get_objects_count = 0
        for key in ops["get"]:
            if "objects" in key:
                get_objects_count += 1
        # The list of 5 objects is shared by 100 workflows, but each object
        # should only be downloaded from storage once.
        assert get_objects_count == 5
def _locate_initial_commit(debug_store: DebugStorage) -> int:
    """Return the index of the first log record that commits step output
    metadata, or -1 if no such record exists."""
    for i in range(len(debug_store)):
        log = debug_store.get_log(i)
        if log["key"].endswith(STEP_OUTPUTS_METADATA):
            return i
    return -1