Example #1
def _create_ml_dataset(name: str,
                       record_pieces: List[RecordPiece],
                       record_sizes: List[int],
                       num_shards: int,
                       shuffle: bool,
                       shuffle_seed: int,
                       RecordBatchCls,
                       node_hints: List[str] = None) -> MLDataset:
    if node_hints is not None:
        assert num_shards % len(node_hints) == 0,\
            f"num_shards: {num_shards} should be a multiple of length of node_hints: {node_hints}"
    # Seed numpy so the row sub-sampling and shuffling below are reproducible.
    if shuffle_seed:
        np.random.seed(shuffle_seed)
    else:
        np.random.seed(0)

    # split the piece into num_shards partitions
    divided_blocks = divide_blocks(blocks=record_sizes,
                                   world_size=num_shards,
                                   shuffle=shuffle,
                                   shuffle_seed=shuffle_seed)

    record_batches = []

    for rank, blocks in divided_blocks.items():
        pieces = []
        for index, num_samples in blocks:
            record_size = record_sizes[index]
            piece = record_pieces[index]
            if num_samples != record_size:
                assert num_samples < record_size
                # This shard needs only part of the block: sample the required
                # number of rows (with replacement) and narrow the piece to them.
                new_row_ids = np.random.choice(
                    record_size, size=num_samples).tolist()
                piece = piece.with_row_ids(new_row_ids)
            pieces.append(piece)

        if shuffle:
            np.random.shuffle(pieces)
        record_batches.append(RecordBatchCls(shard_id=rank,
                                             prefix=name,
                                             record_pieces=pieces,
                                             shuffle=shuffle,
                                             shuffle_seed=shuffle_seed))

    worker_cls = ray.remote(ParallelIteratorWorkerWithLen)
    if node_hints is not None:
        actors = []
        multiplier = num_shards // len(node_hints)
        # Pin each shard's actor to its hinted node via that node's custom resource.
        resource_keys = [f"node:{node_hints[i // multiplier]}" for i in range(num_shards)]
        for g, resource_key in zip(record_batches, resource_keys):
            actor = worker_cls.options(resources={resource_key: 0.01}).remote(g, False, len(g))
            actors.append(actor)
    else:
        actors = [worker_cls.remote(g, False, len(g)) for g in record_batches]

    it = parallel_it.from_actors(actors, name)
    ds = ml_dataset.from_parallel_iter(
        it, need_convert=False, batch_size=0, repeated=False)
    return ds
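
For orientation, here is a minimal, self-contained sketch of the interface `_create_ml_dataset` consumes from `divide_blocks`: a dict mapping each shard rank to `(block_index, num_samples)` pairs whose record totals are balanced. The name `toy_divide_blocks` and its exact splitting rule are assumptions for illustration, not RayDP's implementation.

from typing import Dict, List, Tuple

import numpy as np


def toy_divide_blocks(block_sizes: List[int],
                      world_size: int,
                      shuffle: bool = False,
                      shuffle_seed: int = 0) -> Dict[int, List[Tuple[int, int]]]:
    """Toy partitioner: split blocks so every shard targets the same record count."""
    total = sum(block_sizes)
    per_shard = -(-total // world_size)  # ceiling division
    shards: Dict[int, List[Tuple[int, int]]] = {i: [] for i in range(world_size)}
    order = list(range(len(block_sizes)))
    if shuffle:
        np.random.seed(shuffle_seed)
        np.random.shuffle(order)
    rank, filled = 0, 0
    for index in order:
        remaining = block_sizes[index]
        while remaining > 0 and rank < world_size:
            take = min(remaining, per_shard - filled)
            shards[rank].append((index, take))
            remaining -= take
            filled += take
            if filled == per_shard:
                rank, filled = rank + 1, 0
    return shards


# e.g. toy_divide_blocks([5, 10, 9], 3)
#      -> {0: [(0, 5), (1, 3)], 1: [(1, 7), (2, 1)], 2: [(2, 8)]}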
Example #2
def _save_spark_df_to_object_store(df: sql.DataFrame,
                                   num_shards: int) -> List["RecordBatchShard"]:
    # call java function from python
    jvm = df.sql_ctx.sparkSession.sparkContext._jvm
    jdf = df._jdf
    object_store_writer = jvm.org.apache.spark.sql.raydp.ObjectStoreWriter(jdf)
    records = object_store_writer.save()

    worker = ray.worker.global_worker

    blocks: List[ray.ObjectRef] = []
    block_sizes: List[int] = []
    for record in records:
        owner_address = record.ownerAddress()
        object_ref = ray.ObjectRef(record.objectId())
        num_records = record.numRecords()
        # Register ownership with the core worker: the JVM-side Spark executor
        # that wrote the block remains the owner, and this Python worker can
        # now borrow the ObjectRef.
        worker.core_worker.deserialize_and_register_object_ref(
            object_ref.binary(), ray.ObjectRef.nil(), owner_address)

        blocks.append(object_ref)
        block_sizes.append(num_records)

    divided_blocks = divide_blocks(block_sizes, num_shards)
    record_batch_set: List[RecordBatchShard] = []
    for i in range(num_shards):
        indexes = divided_blocks[i]
        object_ids = [blocks[index] for index in indexes]
        record_batch_set.append(RecordBatchShard(i, object_ids))
    return record_batch_set
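
The two-argument call above expects `divide_blocks(block_sizes, num_shards)` to return, for each shard, a list of whole-block indexes with roughly balanced record totals. A hypothetical stand-in showing that shape (`toy_divide_whole_blocks` is an assumption, not the real utility), using a greedy largest-block-first heuristic:

from typing import Dict, List


def toy_divide_whole_blocks(block_sizes: List[int],
                            num_shards: int) -> Dict[int, List[int]]:
    """Toy balancer: assign each whole block to the currently lightest shard."""
    shards: Dict[int, List[int]] = {i: [] for i in range(num_shards)}
    totals = [0] * num_shards
    # Walk blocks from largest to smallest, always filling the lightest shard.
    for index in sorted(range(len(block_sizes)), key=lambda i: -block_sizes[i]):
        rank = totals.index(min(totals))
        shards[rank].append(index)
        totals[rank] += block_sizes[index]
    return shards


# e.g. toy_divide_whole_blocks([5, 10, 9], 3) -> {0: [1], 1: [2], 2: [0]}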
Example #3
def test_divide_blocks():
    blocks = [5, 10, 9]
    world_size = 3

    def _sum(packed_indexes) -> int:
        return sum([len(i) for i in packed_indexes])

    # no shuffle, no pack
    divided_blocks = utils.divide_blocks(blocks, world_size, None, False,
                                         False)
    assert len(divided_blocks) == 3
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert sum(block_size_0) == sum(block_size_1) == sum(block_size_2)

    # no shuffle, pack
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, False, True)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert _sum(block_size_0) == _sum(block_size_1) == _sum(block_size_2)

    # shuffle, no pack
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, True, False)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert sum(block_size_0) == sum(block_size_1) == sum(block_size_2)

    # shuffle, pack
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, True, True)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert _sum(block_size_0) == _sum(block_size_1) == _sum(block_size_2)

    # special case
    blocks = [10]
    divided_blocks = utils.divide_blocks(blocks, world_size, 0, False, False)
    block_indexes_0, block_size_0 = divided_blocks[0]
    block_indexes_1, block_size_1 = divided_blocks[1]
    block_indexes_2, block_size_2 = divided_blocks[2]
    assert sum(block_size_0) == sum(block_size_1) == sum(block_size_2)
    assert sum(block_size_0) == 4
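
The expected shard sizes in this test are consistent with ceiling division of the total record count by `world_size`, with any shortfall presumably padded by re-sampling. A quick check of that arithmetic, offered as an observation about the test's numbers rather than a documented contract:

import math


def expected_records_per_shard(block_sizes, world_size):
    # In this test each shard ends up with ceil(total / world_size) records.
    return math.ceil(sum(block_sizes) / world_size)


assert expected_records_per_shard([5, 10, 9], 3) == 8   # 24 records over 3 shards
assert expected_records_per_shard([10], 3) == 4         # the padded special case above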
Example #4
def test_divide_blocks():
    blocks = [5, 1, 2, 3, 5, 6, 2, 1, 2]
    world_size = 3

    def get_num_records(sub_blocks):
        nums = 0
        for index, num in sub_blocks:
            assert num <= blocks[index]
            nums += num
        return nums

    divided_blocks = utils.divide_blocks(blocks, world_size, False)
    assert len(divided_blocks) == 3

    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2

    divided_blocks = utils.divide_blocks(blocks, world_size, True)
    assert len(divided_blocks) == 3

    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2

    blocks = [5, 1, 2, 3, 5, 6, 2, 2, 2]
    world_size = 3

    divided_blocks = utils.divide_blocks(blocks, world_size, False)
    assert len(divided_blocks) == 3

    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2

    divided_blocks = utils.divide_blocks(blocks, world_size, True)
    assert len(divided_blocks) == 3

    blocks_0 = get_num_records(divided_blocks[0])
    blocks_1 = get_num_records(divided_blocks[1])
    blocks_2 = get_num_records(divided_blocks[2])
    assert blocks_0 == blocks_1 == blocks_2
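
A note on why both runs can assert equal shares: the first block list sums to 27, which splits exactly as 9 + 9 + 9, while the second sums to 28, so equal shares there require this variant to split, re-sample, or drop records rather than hand out whole blocks intact (the `num <= blocks[index]` check in `get_num_records` already allows partial blocks). The arithmetic:

blocks_a = [5, 1, 2, 3, 5, 6, 2, 1, 2]
blocks_b = [5, 1, 2, 3, 5, 6, 2, 2, 2]
assert sum(blocks_a) == 27 and sum(blocks_a) % 3 == 0   # exact 9 + 9 + 9 split
assert sum(blocks_b) == 28 and sum(blocks_b) % 3 != 0   # equal shares need padding or trimming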
Example #5
def test_divide_blocks():
    blocks = [5, 1, 2, 3, 5, 6, 2, 1, 2]
    world_size = 3

    divided_blocks = utils.divide_blocks(blocks, world_size)
    assert len(divided_blocks) == 3
    blocks_0 = [blocks[i] for i in divided_blocks[0]]
    blocks_1 = [blocks[i] for i in divided_blocks[1]]
    blocks_2 = [blocks[i] for i in divided_blocks[2]]
    assert sum(blocks_0) == sum(blocks_1) == sum(blocks_2)

    blocks = [5, 1, 2, 3, 5, 6, 2, 2, 2]
    world_size = 3

    divided_blocks = utils.divide_blocks(blocks, world_size)
    assert len(divided_blocks) == 3
    blocks_0 = [blocks[i] for i in divided_blocks[0]]
    blocks_1 = [blocks[i] for i in divided_blocks[1]]
    blocks_2 = [blocks[i] for i in divided_blocks[2]]
    assert sum(blocks_1) == sum(blocks_2)
    assert sum(blocks_0) == (sum(blocks_1) + 1)
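
The `+ 1` in the last assertion is plain arithmetic: this variant assigns whole blocks, and the sizes sum to 28, which three shards can at best split as 10 + 9 + 9. One balanced whole-block grouping is shown below; the record counts are illustrative, and the actual indexes chosen by `divide_blocks` may differ.

blocks = [5, 1, 2, 3, 5, 6, 2, 2, 2]
assert sum(blocks) == 28
shard_0 = [5, 3, 2]      # 10 records
shard_1 = [6, 2, 1]      # 9 records
shard_2 = [5, 2, 2]      # 9 records
assert sum(shard_0) == sum(shard_1) + 1 == sum(shard_2) + 1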
Example #6
    def save_to_ray(self, df: pyspark.sql.DataFrame,
                    num_shards: int) -> PandasDataset:
        # call java function from python
        sql_context = df.sql_ctx
        jvm = sql_context.sparkSession.sparkContext._jvm
        jdf = df._jdf
        object_store_writer = jvm.org.apache.spark.sql.raydp.ObjectStoreWriter(
            jdf)
        records = object_store_writer.save()

        worker = ray.worker.global_worker

        blocks: List[ray.ObjectRef] = []
        block_sizes: List[int] = []
        for record in records:
            owner_address = record.ownerAddress()
            object_ref = ray.ObjectRef(record.objectId())
            num_records = record.numRecords()
            # Register the ownership of the ObjectRef
            worker.core_worker.deserialize_and_register_object_ref(
                object_ref.binary(), ray.ObjectRef.nil(), owner_address)

            blocks.append(object_ref)
            block_sizes.append(num_records)

        divided_blocks = divide_blocks(block_sizes, num_shards, None, False,
                                       False)
        record_batch_set: List[List[RecordBatch]] = []
        for i in range(num_shards):
            indexes, record_sizes = divided_blocks[i]
            object_ids = [blocks[index] for index in indexes]
            record_batch_set.append([RecordBatch(object_ids, record_sizes)])

        # TODO: we should specify the resource spec for each shard
        ds = parallel_dataset.from_iterators(generators=record_batch_set,
                                             name="spark_df")

        def resolve_fn(it: Iterable[RecordBatch]) -> Iterator[RecordBatch]:
            for item in it:
                item.resolve()
                yield item

        return ds.transform(resolve_fn,
                            ".RecordBatch#resolve()").flatten().to_pandas(None)
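
The `resolve_fn` transform defers materialization: blocks are only pulled from the object store when a shard is actually iterated, on whichever worker runs the transform. `RecordBatch` is project-specific; purely to illustrate that shape, a hypothetical minimal holder could look like the sketch below (`ToyRecordBatch` is an assumption, not the real class).

from typing import List, Optional

import ray


class ToyRecordBatch:
    """Hypothetical stand-in for a RecordBatch that resolves its blocks lazily."""

    def __init__(self, object_refs: List[ray.ObjectRef], sizes: List[int]):
        self._object_refs = object_refs
        self._sizes = sizes
        self._data: Optional[list] = None

    def resolve(self) -> None:
        # Deferring ray.get until resolve() is called means the consumer of the
        # shard, not the driver, fetches the blocks from the object store.
        if self._data is None:
            self._data = ray.get(self._object_refs)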