def test_chaos_task_retry(ray_start_chaos_cluster):
    # Chaos testing.
    @ray.remote(max_retries=-1)
    def task():
        def generate_data(size_in_kb=10):
            return np.zeros(1024 * size_in_kb, dtype=np.uint8)

        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return generate_data(size_in_kb=50)

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    # 50MB of return values.
    TOTAL_TASKS = 300
    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    start = time.time()
    pb.block_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()

def run_task_workload(total_num_cpus, smoke):
    """Run a task-based workload that doesn't require object reconstruction."""

    @ray.remote(num_cpus=1, max_retries=-1)
    def task():
        def generate_data(size_in_kb=10):
            return np.zeros(1024 * size_in_kb, dtype=np.uint8)

        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return generate_data(size_in_kb=50)

    @ray.remote(num_cpus=1, max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    multiplier = 75
    # For smoke mode, run fewer tasks.
    if smoke:
        multiplier = 1
    TOTAL_TASKS = int(total_num_cpus * 2 * multiplier)

    pb = ProgressBar("Chaos test", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    pb.block_until_complete(results)
    pb.close()

    # Consistency check.
    wait_for_condition(
        lambda: (ray.cluster_resources().get("CPU", 0) ==
                 ray.available_resources().get("CPU", 0)),
        timeout=60)

def sample_boundaries(blocks: BlockList[T], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    sample_items = np.concatenate(samples)
    sample_items.sort()
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.arange(0, 1, 1 / num_reducers)
    ]
    return ret[1:]

def do_agg(blocks, clear_input_blocks: bool, block_udf):
    # TODO: implement clear_input_blocks
    stage_info = {}
    if len(aggs) == 0:
        raise ValueError("Aggregate requires at least one aggregation")
    for agg in aggs:
        agg._validate(self._dataset)
    # Handle empty dataset.
    if blocks.initial_num_blocks() == 0:
        return blocks, stage_info

    num_mappers = blocks.initial_num_blocks()
    num_reducers = num_mappers
    if self._key is None:
        num_reducers = 1
        boundaries = []
    else:
        boundaries = sort.sample_boundaries(
            blocks.get_blocks(),
            [(self._key, "ascending")]
            if isinstance(self._key, str) else self._key,
            num_reducers,
        )

    partition_and_combine_block = cached_remote_fn(
        _partition_and_combine_block).options(num_returns=num_reducers + 1)
    aggregate_combined_blocks = cached_remote_fn(
        _aggregate_combined_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    map_meta = []
    for i, block in enumerate(blocks.get_blocks()):
        results = partition_and_combine_block.remote(
            block, boundaries, self._key, aggs)
        map_results[i, :] = results[:-1]
        map_meta.append(results[-1])
    map_bar = ProgressBar("GroupBy Map", len(map_results))
    map_bar.block_until_complete(map_meta)
    stage_info["map"] = ray.get(map_meta)
    map_bar.close()

    blocks = []
    metadata = []
    for j in range(num_reducers):
        block, meta = aggregate_combined_blocks.remote(
            num_reducers, self._key, aggs, *map_results[:, j].tolist())
        blocks.append(block)
        metadata.append(meta)
    reduce_bar = ProgressBar("GroupBy Reduce", len(blocks))
    reduce_bar.block_until_complete(blocks)
    reduce_bar.close()

    metadata = ray.get(metadata)
    stage_info["reduce"] = metadata
    return BlockList(blocks, metadata), stage_info

def sort_impl(blocks: BlockList, key: SortKeyT,
              descending: bool = False) -> Tuple[BlockList, dict]:
    stage_info = {}
    blocks = blocks.get_blocks()
    if len(blocks) == 0:
        return BlockList([], []), stage_info

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers + 1)
    merge_sorted_blocks = cached_remote_fn(_merge_sorted_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    map_meta = []
    for i, block in enumerate(blocks):
        result = sort_block.remote(block, boundaries, key, descending)
        map_results[i, :] = result[:-1]
        map_meta.append(result[-1])

    # Early release memory.
    del blocks

    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete(map_meta)
    map_bar.close()
    stage_info["map"] = ray.get(map_meta)

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(key, descending,
                                         *map_results[:, j].tolist())
        reduce_results.append(ret)

    # Early release memory.
    del map_results

    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    stage_info["merge"] = metadata
    return BlockList(blocks, metadata), stage_info

def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[Any]) -> BlockList[Any]:
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    kwargs = remote_args.copy()
    kwargs["num_returns"] = 2

    map_block = cached_remote_fn(_map_block)
    refs = [
        map_block.options(**kwargs).remote(b, m, fn)
        for b, m in zip(blocks, blocks.get_metadata())
    ]
    new_blocks, new_metadata = zip(*refs)

    map_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    return BlockList(list(new_blocks), list(new_metadata))

def sort_impl(blocks: BlockList, key: SortKeyT,
              descending: bool = False) -> BlockList:
    blocks = list(blocks.iter_blocks())
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    sort_block = cached_remote_fn(_sort_block).options(
        num_returns=num_reducers)
    merge_sorted_blocks = cached_remote_fn(_merge_sorted_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries, key,
                                              descending)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(key, descending,
                                         *map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)

def simple_shuffle(input_blocks: BlockList[T],
                   output_num_blocks: int,
                   *,
                   random_shuffle: bool = False,
                   random_seed: Optional[int] = None) -> BlockList[T]:
    input_num_blocks = len(input_blocks)

    shuffle_map = cached_remote_fn(_shuffle_map).options(
        num_returns=output_num_blocks)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce, num_returns=2)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.remote(block, i, output_num_blocks, random_shuffle,
                           random_seed)
        for i, block in enumerate(input_blocks)
    ]
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce", position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))

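# A minimal local sketch (plain NumPy, no Ray) of the all-to-all pattern that
# simple_shuffle above drives with remote tasks: map splits input block i into
# output_num_blocks partitions, then reducer j gathers partition j of every
# map output, which is exactly the shuffle_map_out[i][j] indexing. The name
# toy_shuffle is illustrative only and not part of the Ray code.
import numpy as np


def toy_shuffle(input_blocks, output_num_blocks):
    # "Map" stage: split every input block into output_num_blocks partitions.
    shuffle_map_out = [
        np.array_split(np.asarray(block), output_num_blocks)
        for block in input_blocks
    ]
    # "Reduce" stage: reducer j concatenates partition j of every map output.
    return [
        np.concatenate([parts[j] for parts in shuffle_map_out])
        for j in range(output_num_blocks)
    ]


# Example: 3 input blocks repartitioned into 2 output blocks.
print(toy_shuffle([[1, 2, 3], [4, 5], [6, 7, 8, 9]], output_num_blocks=2))
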
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty.
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingArrowBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]

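# A minimal, self-contained sketch (not Ray code) of the quantile math that
# sample_boundaries above applies to the pooled samples: sort, take
# num_reducers evenly spaced quantiles, and drop the first one (the minimum),
# leaving num_reducers - 1 boundaries that split the key domain into roughly
# equal-sized ranges. The name toy_sample_boundaries is illustrative only.
import numpy as np


def toy_sample_boundaries(sample_items, num_reducers):
    sample_items = np.sort(np.asarray(sample_items))
    quantiles = np.linspace(0, 1, num_reducers)
    # NumPy >= 1.22 spells this method="nearest"; the Ray code above uses the
    # older interpolation="nearest" keyword for the same behavior.
    boundaries = np.quantile(sample_items, quantiles, method="nearest")
    return list(boundaries[1:])


# Example: 4 reducers over keys 0..99 give 3 boundaries near the 33rd, 66th,
# and 100th percentiles: [33, 66, 99].
print(toy_sample_boundaries(np.arange(100), num_reducers=4))
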
def test_chaos_task_retry(set_kill_interval):
    # Chaos testing.
    @ray.remote(max_retries=-1)
    def task():
        a = ""
        for _ in range(100000):
            a = a + random.choice(string.ascii_letters)
        return

    @ray.remote(max_retries=-1)
    def invoke_nested_task():
        time.sleep(0.8)
        return ray.get(task.remote())

    # 50MB of return values.
    TOTAL_TASKS = 100

    pb = ProgressBar("Chaos test sanity check", TOTAL_TASKS)
    results = [invoke_nested_task.remote() for _ in range(TOTAL_TASKS)]
    start = time.time()
    pb.block_until_complete(results)
    runtime_with_failure = time.time() - start
    print(f"Runtime when there are many failures: {runtime_with_failure}")
    pb.close()

def simple_shuffle(
    input_blocks: BlockList,
    block_udf: Optional[Callable[[Block], Iterable[Block]]],
    output_num_blocks: int,
    *,
    random_shuffle: bool = False,
    random_seed: Optional[int] = None,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    _spread_resource_prefix: Optional[str] = None
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    input_blocks = input_blocks.get_blocks()
    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    if "scheduling_strategy" not in reduce_ray_remote_args:
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"
    input_num_blocks = len(input_blocks)
    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        map_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, map_ray_remote_args)
        reduce_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, reduce_ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        map_resource_iter, reduce_resource_iter = itertools.tee(
            itertools.repeat({}), 2)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + output_num_blocks,
            resources=next(map_resource_iter)).remote(
                block, block_udf, i, output_num_blocks, random_shuffle,
                random_seed)
        for i, block in enumerate(input_blocks)
    ]

    # The first item returned is the BlockMetadata.
    shuffle_map_metadata = []
    for i, refs in enumerate(shuffle_map_out):
        shuffle_map_metadata.append(refs[0])
        shuffle_map_out[i] = refs[1:]

    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    shuffle_map_metadata = map_bar.fetch_until_complete(shuffle_map_metadata)
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce", position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(
            **reduce_ray_remote_args,
            num_returns=2,
            resources=next(reduce_resource_iter)).remote(
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    stats = {
        "map": shuffle_map_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats

def simple_shuffle(
        input_blocks: BlockList[T],
        output_num_blocks: int,
        *,
        random_shuffle: bool = False,
        random_seed: Optional[int] = None,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
        _spread_resource_prefix: Optional[str] = None) -> BlockList[T]:
    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    input_num_blocks = len(input_blocks)
    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        map_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, map_ray_remote_args)
        reduce_resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, reduce_ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        map_resource_iter, reduce_resource_iter = itertools.tee(
            itertools.repeat({}), 2)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=output_num_blocks,
            resources=next(map_resource_iter)).remote(
                block, i, output_num_blocks, random_shuffle, random_seed)
        for i, block in enumerate(input_blocks)
    ]
    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce", position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(
            **reduce_ray_remote_args,
            num_returns=2,
            resources=next(reduce_resource_iter)).remote(
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))

def aggregate(self, *aggs: AggregateFn) -> Dataset[U]:
    """Implements the accumulator-based aggregation.

    This is a blocking operation.

    Examples:
        >>> grouped_ds.aggregate(AggregateFn(
        ...     init=lambda k: [],
        ...     accumulate=lambda a, r: a + [r],
        ...     merge=lambda a1, a2: a1 + a2,
        ...     finalize=lambda a: a
        ... ))

    Args:
        aggs: Aggregations to do.

    Returns:
        If the input dataset is a simple dataset, the output is a simple
        dataset of (k, v_1, ..., v_n) tuples where k is the groupby key
        and v_i is the result of the ith given aggregation.
        If the input dataset is an Arrow dataset, the output is an Arrow
        dataset of n + 1 columns where the first column is the groupby key
        and the second through n + 1 columns are the results of the
        aggregations.
        If the groupby key is None, the key part of the return is omitted.
    """
    if len(aggs) == 0:
        raise ValueError("Aggregate requires at least one aggregation")
    # Handle empty dataset.
    if self._dataset.num_blocks() == 0:
        return self._dataset

    blocks = list(self._dataset._blocks.iter_blocks())
    num_mappers = len(blocks)
    num_reducers = num_mappers
    if self._key is None:
        num_reducers = 1
        boundaries = []
    else:
        boundaries = sort.sample_boundaries(
            blocks,
            [(self._key, "ascending")]
            if isinstance(self._key, str) else self._key,
            num_reducers)

    partition_and_combine_block = cached_remote_fn(
        _partition_and_combine_block).options(num_returns=num_reducers)
    aggregate_combined_blocks = cached_remote_fn(
        _aggregate_combined_blocks, num_returns=2)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = partition_and_combine_block.remote(
            block, boundaries, self._key, aggs)
    map_bar = ProgressBar("GroupBy Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    blocks = []
    metadata = []
    for j in range(num_reducers):
        block, meta = aggregate_combined_blocks.remote(
            num_reducers, self._key, aggs, *map_results[:, j].tolist())
        blocks.append(block)
        metadata.append(meta)
    reduce_bar = ProgressBar("GroupBy Reduce", len(blocks))
    reduce_bar.block_until_complete(blocks)
    reduce_bar.close()

    metadata = ray.get(metadata)
    return Dataset(BlockList(blocks, metadata), self._dataset._epoch)

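# A tiny local sketch (no Ray) of the accumulator protocol that aggregate
# above drives: an AggregateFn supplies init, accumulate, merge, and finalize;
# mappers accumulate per group within a block, reducers merge partial
# accumulators across blocks and finalize. The name toy_aggregate and the
# fixed chunk size of 2 are illustrative only.
def toy_aggregate(rows, key, init, accumulate, merge, finalize):
    # Per-"block" accumulation (here: one block per chunk of two rows).
    partials = []
    for chunk in [rows[i:i + 2] for i in range(0, len(rows), 2)]:
        accs = {}
        for row in chunk:
            k = key(row)
            accs[k] = accumulate(accs.get(k, init(k)), row)
        partials.append(accs)
    # Merge partial accumulators across blocks, then finalize each group.
    merged = {}
    for accs in partials:
        for k, a in accs.items():
            merged[k] = merge(merged[k], a) if k in merged else a
    return {k: finalize(a) for k, a in merged.items()}


# Example: group integers by parity and collect them into lists, mirroring
# the AggregateFn in the docstring above. Prints {0: [0, 2, 4], 1: [1, 3, 5]}.
print(toy_aggregate(
    list(range(6)),
    key=lambda r: r % 2,
    init=lambda k: [],
    accumulate=lambda a, r: a + [r],
    merge=lambda a1, a2: a1 + a2,
    finalize=lambda a: a,
))
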
def simple_shuffle(
        input_blocks: BlockList[T],
        output_num_blocks: int,
        *,
        random_shuffle: bool = False,
        random_seed: Optional[int] = None,
        map_ray_remote_args: Optional[Dict[str, Any]] = None,
        reduce_ray_remote_args: Optional[Dict[str, Any]] = None
) -> BlockList[T]:
    # Check for spread resource labels in environment variable, and use
    # the given labels for round-robin resource-based scheduling.
    shuffle_spread_custom_resource_labels = os.getenv(
        "RAY_DATASETS_SHUFFLE_SPREAD_CUSTOM_RESOURCE_LABELS", None)
    if shuffle_spread_custom_resource_labels is not None:
        shuffle_spread_custom_resource_labels = (
            shuffle_spread_custom_resource_labels.split(","))
        round_robin_resource_provider = itertools.cycle(
            map(lambda resource: {resource: 0.001},
                shuffle_spread_custom_resource_labels))
    else:
        # If no round-robin resource provider given, yield an empty
        # dictionary.
        round_robin_resource_provider = itertools.repeat({})
    # Create separate resource iterators for the map and reduce stages.
    map_resource_iter, reduce_resource_iter = itertools.tee(
        round_robin_resource_provider, 2)

    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    input_num_blocks = len(input_blocks)

    shuffle_map = cached_remote_fn(_shuffle_map)
    shuffle_reduce = cached_remote_fn(_shuffle_reduce)

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=output_num_blocks,
            resources=next(map_resource_iter)).remote(
                block, i, output_num_blocks, random_shuffle, random_seed)
        for i, block in enumerate(input_blocks)
    ]
    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    # Randomize the reduce order of the blocks.
    if random_shuffle:
        random = np.random.RandomState(random_seed)
        random.shuffle(shuffle_map_out)

    reduce_bar = ProgressBar("Shuffle Reduce", position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(
            **reduce_ray_remote_args,
            num_returns=2,
            resources=next(reduce_resource_iter)).remote(
                *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))