Beispiel #1
0
 def sample_block(block: Block[T]) -> np.ndarray:
     """Draw a random sample of rows from ``block``.

     Uses the closed-over ``n_samples`` and ``key`` from the enclosing scope.
     """
     accessor = BlockAccessor.for_block(block)
     return accessor.sample(n_samples, key)
Beispiel #2
0
 def sort_block(block, boundaries):
     """Sort ``block`` and partition it at the given boundary values.

     ``key`` and ``descending`` come from the enclosing scope.
     """
     accessor = BlockAccessor.for_block(block)
     return accessor.sort_and_partition(boundaries, key, descending)
Beispiel #3
0
 def get_metadata(table: "pyarrow.Table") -> BlockMetadata:
     """Compute block metadata for ``table`` (no input-file provenance)."""
     accessor = BlockAccessor.for_block(table)
     return accessor.get_metadata(input_files=None)
Beispiel #4
0
def test_json_read(ray_start_regular_shared, tmp_path):
    """End-to-end checks for ``read_json``: single file, multiple files,
    directories, and mixed directory/file inputs, plus basic metadata ops."""
    # Single file.
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    path1 = os.path.join(tmp_path, "test1.json")
    df1.to_json(path1, orient="records", lines=True)
    ds = ray.experimental.data.read_json(path1)
    assert df1.equals(ray.get(ds.to_pandas())[0])
    # Test metadata ops.
    assert ds.count() == 3
    assert ds.input_files() == [path1]
    assert "{one: int64, two: string}" in str(ds), ds

    # Two files, parallelism=2.
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    path2 = os.path.join(tmp_path, "test2.json")
    df2.to_json(path2, orient="records", lines=True)
    ds = ray.experimental.data.read_json([path1, path2], parallelism=2)
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert pd.concat([df1, df2]).equals(dsdf)
    # Test metadata ops: per-block size must match the recorded metadata.
    # NOTE: the original version was missing the ``assert``, making this a
    # no-op expression that checked nothing.
    for block, meta in zip(ds._blocks, ds._blocks.get_metadata()):
        assert BlockAccessor.for_block(
            ray.get(block)).size_bytes() == meta.size_bytes

    # Three files, parallelism=2.
    df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
    path3 = os.path.join(tmp_path, "test3.json")
    df3.to_json(path3, orient="records", lines=True)
    df = pd.concat([df1, df2, df3], ignore_index=True)
    ds = ray.experimental.data.read_json([path1, path2, path3], parallelism=2)
    dsdf = pd.concat(ray.get(ds.to_pandas()), ignore_index=True)
    assert df.equals(dsdf)

    # Directory, two files.
    path = os.path.join(tmp_path, "test_json_dir")
    os.mkdir(path)
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    path1 = os.path.join(path, "data0.json")
    df1.to_json(path1, orient="records", lines=True)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    path2 = os.path.join(path, "data1.json")
    df2.to_json(path2, orient="records", lines=True)
    ds = ray.experimental.data.read_json(path)
    df = pd.concat([df1, df2])
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert df.equals(dsdf)
    shutil.rmtree(path)

    # Two directories, three files.
    path1 = os.path.join(tmp_path, "test_json_dir1")
    path2 = os.path.join(tmp_path, "test_json_dir2")
    os.mkdir(path1)
    os.mkdir(path2)
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    file_path1 = os.path.join(path1, "data0.json")
    df1.to_json(file_path1, orient="records", lines=True)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    file_path2 = os.path.join(path2, "data1.json")
    df2.to_json(file_path2, orient="records", lines=True)
    df3 = pd.DataFrame({"one": [7, 8, 9], "two": ["h", "i", "j"]})
    file_path3 = os.path.join(path2, "data2.json")
    df3.to_json(file_path3, orient="records", lines=True)
    ds = ray.experimental.data.read_json([path1, path2])
    df = pd.concat([df1, df2, df3])
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert df.equals(dsdf)
    shutil.rmtree(path1)
    shutil.rmtree(path2)

    # Directory and file, two files.
    dir_path = os.path.join(tmp_path, "test_json_dir")
    os.mkdir(dir_path)
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    path1 = os.path.join(dir_path, "data0.json")
    df1.to_json(path1, orient="records", lines=True)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    path2 = os.path.join(tmp_path, "data1.json")
    df2.to_json(path2, orient="records", lines=True)
    ds = ray.experimental.data.read_json([dir_path, path2])
    df = pd.concat([df1, df2])
    dsdf = pd.concat(ray.get(ds.to_pandas()))
    assert df.equals(dsdf)
    shutil.rmtree(dir_path)
Beispiel #5
0
 def write(self, block: Block) -> str:
     """Count the rows of ``block`` into ``self.rows_written``.

     Raises:
         ValueError: if this writer is not enabled.

     Returns:
         The literal string "ok" on success.
     """
     accessor = BlockAccessor.for_block(block)
     if not self.enabled:
         raise ValueError("disabled")
     self.rows_written += accessor.num_rows()
     return "ok"
Beispiel #6
0
 def df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
     """Convert a pandas DataFrame into an Arrow block plus its metadata."""
     table = pa.table(df)
     metadata = BlockAccessor.for_block(table).get_metadata(input_files=None)
     return (table, metadata)
Beispiel #7
0
 def agg(block: Block) -> int:
     """Sum every row of ``block``."""
     accessor = BlockAccessor.for_block(block)
     total = 0
     for row in accessor.iter_rows():
         total += row
     return total
Beispiel #8
0
 def json_write(write_path: str, block: Block):
     """Serialize ``block`` to ``write_path`` as a JSON records file."""
     block = BlockAccessor.for_block(block)
     logger.debug(f"Writing {block.num_rows()} records to {write_path}.")
     frame = block.to_pandas()
     frame.to_json(write_path, orient="records")
Beispiel #9
0
 def count(block: Block) -> int:
     """Return the number of rows in ``block``."""
     return BlockAccessor.for_block(block).num_rows()
Beispiel #10
0
 def block_to_df(block: Block):
     """Materialize ``block`` as a pyarrow Table."""
     return BlockAccessor.for_block(block).to_arrow_table()
Beispiel #11
0
 def block_to_df(block: Block):
     """Materialize ``block`` as a pandas DataFrame."""
     return BlockAccessor.for_block(block).to_pandas()
Beispiel #12
0
 def transform(block: Block) -> Block:
     """Apply the closed-over ``fn`` to each row and build a new block."""
     accessor = BlockAccessor.for_block(block)
     out = DelegatingArrowBlockBuilder()
     for record in accessor.iter_rows():
         out.add(fn(record))
     return out.build()
Beispiel #13
0
 def get_schema(block: Block) -> Any:
     """Return the schema of ``block``."""
     accessor = BlockAccessor.for_block(block)
     return accessor.schema()