def _create_ml_dataset(name: str,
                       record_pieces: List[RecordPiece],
                       record_sizes: List[int],
                       num_shards: int,
                       shuffle: bool,
                       shuffle_seed: int,
                       RecordBatchCls,
                       node_hints: List[str] = None) -> MLDataset:
    if node_hints is not None:
        assert num_shards % len(node_hints) == 0, \
            f"num_shards: {num_shards} should be a multiple of the length of node_hints: {node_hints}"
    if shuffle_seed:
        np.random.seed(shuffle_seed)
    else:
        np.random.seed(0)

    # split the pieces into num_shards partitions
    divided_blocks = divide_blocks(blocks=record_sizes,
                                   world_size=num_shards,
                                   shuffle=shuffle,
                                   shuffle_seed=shuffle_seed)

    record_batches = []
    for rank, blocks in divided_blocks.items():
        pieces = []
        for index, num_samples in blocks:
            record_size = record_sizes[index]
            piece = record_pieces[index]
            if num_samples != record_size:
                assert num_samples < record_size
                new_row_ids = np.random.choice(
                    record_size, size=num_samples).tolist()
                piece = piece.with_row_ids(new_row_ids)
            pieces.append(piece)
        if shuffle:
            np.random.shuffle(pieces)
        record_batches.append(RecordBatchCls(shard_id=rank,
                                             prefix=name,
                                             record_pieces=pieces,
                                             shuffle=shuffle,
                                             shuffle_seed=shuffle_seed))

    worker_cls = ray.remote(ParallelIteratorWorkerWithLen)
    if node_hints is not None:
        actors = []
        multiplier = num_shards // len(node_hints)
        resource_keys = [f"node:{node_hints[i // multiplier]}"
                         for i in range(num_shards)]
        for g, resource_key in zip(record_batches, resource_keys):
            actor = worker_cls.options(
                resources={resource_key: 0.01}).remote(g, False, len(g))
            actors.append(actor)
    else:
        actors = [worker_cls.remote(g, False, len(g)) for g in record_batches]

    it = parallel_it.from_actors(actors, name)
    ds = ml_dataset.from_parallel_iter(
        it, need_convert=False, batch_size=0, repeated=False)
    return ds
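# Illustrative sketch only: how the node_hints placement above maps shard
# ranks to Ray node resource keys. Each consecutive group of
# num_shards // len(node_hints) shards is pinned to one hinted node via a
# tiny custom-resource request. The addresses below are hypothetical.
num_shards = 4
node_hints = ["10.0.0.1", "10.0.0.2"]

multiplier = num_shards // len(node_hints)  # shards per hinted node
resource_keys = [f"node:{node_hints[i // multiplier]}" for i in range(num_shards)]
print(resource_keys)
# ['node:10.0.0.1', 'node:10.0.0.1', 'node:10.0.0.2', 'node:10.0.0.2']
# Each shard actor is then created with options(resources={resource_key: 0.01}),
# so Ray schedules it on the node that advertises that "node:<ip>" resource.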
def _save_spark_df_to_object_store(df: sql.DataFrame,
                                   num_shards: int) -> List["RecordBatchShard"]:
    # call java function from python
    jvm = df.sql_ctx.sparkSession.sparkContext._jvm
    jdf = df._jdf
    object_store_writer = jvm.org.apache.spark.sql.raydp.ObjectStoreWriter(jdf)
    records = object_store_writer.save()

    worker = ray.worker.global_worker

    blocks: List[ray.ObjectRef] = []
    block_sizes: List[int] = []
    for record in records:
        owner_address = record.ownerAddress()
        object_ref = ray.ObjectRef(record.objectId())
        num_records = record.numRecords()
        # Register the ownership of the ObjectRef
        worker.core_worker.deserialize_and_register_object_ref(
            object_ref.binary(), ray.ObjectRef.nil(), owner_address)
        blocks.append(object_ref)
        block_sizes.append(num_records)

    divided_blocks = divide_blocks(block_sizes, num_shards)
    record_batch_set: List[RecordBatchShard] = []
    for i in range(num_shards):
        indexes = divided_blocks[i]
        object_ids = [blocks[index] for index in indexes]
        record_batch_set.append(RecordBatchShard(i, object_ids))
    return record_batch_set
def test_divide_blocks():
    blocks = [5, 10, 9]
    world_size = 3

    def _sum(packed_indexes) -> int:
        return sum([len(i) for i in packed_indexes])

    # no shuffle, no pack
    divided_blocks = utils.divide_blocks(blocks, world_size, None, False, False)
    assert len(divided_blocks) == 3
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert sum(block_size_0) == sum(block_size_1) == sum(block_size_2)

    # no shuffle, pack
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, False, True)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert _sum(block_size_0) == _sum(block_size_1) == _sum(block_size_2)

    # shuffle, no pack
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, True, False)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert sum(block_size_0) == sum(block_size_1) == sum(block_size_2)

    # shuffle, pack
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, True, True)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert _sum(block_size_0) == _sum(block_size_1) == _sum(block_size_2)

    # special case
    blocks = [10]
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, False, False)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert sum(block_size_0) == sum(block_size_1) == sum(block_size_2)
    assert sum(block_size_0) == 4
def test_divide_blocks():
    blocks = [5, 1, 2, 3, 5, 6, 2, 1, 2]
    world_size = 3

    def get_num_records(sub_blocks):
        nums = 0
        for index, num in sub_blocks:
            assert num <= blocks[index]
            nums += num
        return nums

    divided_blocks = utils.divide_blocks(blocks, world_size, False)
    assert len(divided_blocks) == 3
    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2

    divided_blocks = utils.divide_blocks(blocks, world_size, True)
    assert len(divided_blocks) == 3
    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2

    blocks = [5, 1, 2, 3, 5, 6, 2, 2, 2]
    world_size = 3

    divided_blocks = utils.divide_blocks(blocks, world_size, False)
    assert len(divided_blocks) == 3
    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2

    divided_blocks = utils.divide_blocks(blocks, world_size, True)
    assert len(divided_blocks) == 3
    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2
def test_divide_blocks():
    blocks = [5, 1, 2, 3, 5, 6, 2, 1, 2]
    world_size = 3

    divided_blocks = utils.divide_blocks(blocks, world_size)
    assert len(divided_blocks) == 3
    blocks_0 = [blocks[i] for i in divided_blocks[0]]
    blocks_1 = [blocks[i] for i in divided_blocks[1]]
    blocks_2 = [blocks[i] for i in divided_blocks[2]]
    assert sum(blocks_0) == sum(blocks_1) == sum(blocks_2)

    blocks = [5, 1, 2, 3, 5, 6, 2, 2, 2]
    world_size = 3

    divided_blocks = utils.divide_blocks(blocks, world_size)
    assert len(divided_blocks) == 3
    blocks_0 = [blocks[i] for i in divided_blocks[0]]
    blocks_1 = [blocks[i] for i in divided_blocks[1]]
    blocks_2 = [blocks[i] for i in divided_blocks[2]]
    assert sum(blocks_1) == sum(blocks_2)
    assert sum(blocks_0) == (sum(blocks_1) + 1)
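# Illustrative sketch only: one way to satisfy the contract exercised by the
# test above -- divide block indexes across world_size ranks so that the
# per-rank record counts are as even as possible. This is NOT the actual
# raydp utils.divide_blocks implementation; it is a hypothetical greedy
# (largest-first) balancer shown purely for explanation.
from typing import Dict, List


def divide_blocks_sketch(blocks: List[int], world_size: int) -> Dict[int, List[int]]:
    # assign each block (largest first) to the currently least-loaded rank
    assignment: Dict[int, List[int]] = {rank: [] for rank in range(world_size)}
    loads = [0] * world_size
    for index in sorted(range(len(blocks)), key=lambda i: blocks[i], reverse=True):
        rank = loads.index(min(loads))
        assignment[rank].append(index)
        loads[rank] += blocks[index]
    return assignment


# Example: 27 records end up 9/9/9 across 3 ranks (the exact block-to-rank
# mapping may differ from what the real implementation produces).
print(divide_blocks_sketch([5, 1, 2, 3, 5, 6, 2, 1, 2], 3))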
def save_to_ray(self, df: pyspark.sql.DataFrame, num_shards: int) -> PandasDataset:
    # call java function from python
    sql_context = df.sql_ctx
    jvm = sql_context.sparkSession.sparkContext._jvm
    jdf = df._jdf
    object_store_writer = jvm.org.apache.spark.sql.raydp.ObjectStoreWriter(jdf)
    records = object_store_writer.save()

    worker = ray.worker.global_worker

    blocks: List[ray.ObjectRef] = []
    block_sizes: List[int] = []
    for record in records:
        owner_address = record.ownerAddress()
        object_id = ray.ObjectID(record.objectId())
        num_records = record.numRecords()
        # Register the ownership of the ObjectRef
        worker.core_worker.deserialize_and_register_object_ref(
            object_id.binary(), ray.ObjectRef.nil(), owner_address)
        blocks.append(object_id)
        block_sizes.append(num_records)

    divided_blocks = divide_blocks(block_sizes, num_shards, None, False, False)
    record_batch_set: List[List[RecordBatch]] = []
    for i in range(num_shards):
        indexes, record_sizes = divided_blocks[i]
        object_ids = [blocks[index] for index in indexes]
        record_batch_set.append([RecordBatch(object_ids, record_sizes)])

    # TODO: we should specify the resource spec for each shard
    ds = parallel_dataset.from_iterators(generators=record_batch_set,
                                         name="spark_df")

    def resolve_fn(it: Iterable[RecordBatch]) -> Iterator[RecordBatch]:
        for item in it:
            item.resolve()
            yield item

    return ds.transform(
        resolve_fn,
        ".RecordBatch#resolve()").flatten().to_pandas(None)