Beispiel #1
0
def test_fails_wrong_partitioning(existing_table: DeltaTable,
                                  sample_data: pa.Table):
    """Expect ``AssertionError`` when appending with ``partition_by="int32"``.

    The append is issued against ``existing_table`` and must be rejected.
    """
    append_kwargs = {"mode": "append", "partition_by": "int32"}

    with pytest.raises(AssertionError):
        write_deltalake(existing_table, sample_data, **append_kwargs)
Beispiel #2
0
def test_writer_with_max_rows(tmp_path: pathlib.Path, row_count: int,
                              rows_per_file: int, expected_files: int):
    """Check that ``max_rows_per_file`` yields the expected number of files.

    Writes ``row_count`` rows with ``max_rows_per_file`` and
    ``max_rows_per_group`` both set to ``rows_per_file``. Afterwards the
    ``numRecords`` stats of all ``add`` log entries must sum to ``row_count``
    and the table directory must hold ``expected_files`` entries besides
    ``_delta_log``.
    """

    def get_multifile_stats(table: DeltaTable) -> Iterable[Dict]:
        """Yield the parsed ``stats`` of every ``add`` entry in the log."""
        log_path = get_log_path(table)

        # Use a context manager so the log file handle is closed once the
        # generator has been consumed instead of being leaked.
        with open(log_path, "r") as log_file:
            for line in log_file:
                log_entry = json.loads(line)

                if "add" in log_entry:
                    yield json.loads(log_entry["add"]["stats"])

    data = pa.table({
        "colA":
        pa.array(range(0, row_count), pa.int32()),
        "colB":
        pa.array([i * random.random() for i in range(0, row_count)],
                 pa.float64()),
    })
    path = str(tmp_path)
    write_deltalake(
        path,
        data,
        file_options=ParquetFileFormat().make_write_options(),
        max_rows_per_file=rows_per_file,
        max_rows_per_group=rows_per_file,
    )

    table = DeltaTable(path)
    stats = get_multifile_stats(table)
    files_written = [f for f in os.listdir(path) if f != "_delta_log"]

    # Generator expression: no need to materialise an intermediate list.
    assert sum(stat_entry["numRecords"] for stat_entry in stats) == row_count
    assert len(files_written) == expected_files
Beispiel #3
0
def test_write_recordbatchreader(tmp_path: pathlib.Path,
                                 existing_table: DeltaTable,
                                 sample_data: pa.Table):
    """Write from a ``RecordBatchReader`` and compare with ``sample_data``."""
    target = str(tmp_path)
    reader = RecordBatchReader.from_batches(
        sample_data.schema,
        existing_table.to_pyarrow_dataset().to_batches(),
    )

    write_deltalake(target, reader, mode="overwrite")

    written = DeltaTable(target).to_pyarrow_table()
    assert written == sample_data
Beispiel #4
0
def test_write_pandas(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Round-trip a pandas DataFrame through ``write_deltalake``."""
    table_uri = str(tmp_path)

    # The "timestamp" column is dropped: converting to pandas casts it to ns
    # resolution, while Delta Lake schemas only support us resolution.
    expected_df = sample_data.to_pandas().drop(["timestamp"], axis=1)
    write_deltalake(table_uri, expected_df)

    actual_df = DeltaTable(table_uri).to_pandas()
    assert_frame_equal(actual_df, expected_df)
Beispiel #5
0
def test_handle_existing(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Writing into a non-empty, non-Delta directory must raise ``OSError``."""
    # Make the target directory non-empty without turning it into a table.
    (tmp_path / "hello.txt").write_text("hello")

    with pytest.raises(OSError) as exception:
        write_deltalake(str(tmp_path), sample_data, mode="overwrite")

    # Check the raised exception itself (``.value``); the ``ExceptionInfo``
    # wrapper's string form is not the error message.
    assert "directory is not empty" in str(exception.value)
Beispiel #6
0
def test_writer_partitioning(tmp_path: pathlib.Path):
    """Round-trip a table whose string column holds ``=``, space and ``%20``."""
    table_uri = str(tmp_path)
    p_values = ["a=b", "hello world", "hello%20world"]
    x_values = range(len(p_values))
    data = pa.table({"p": pa.array(p_values), "x": pa.array(x_values)})

    # NOTE(review): no ``partition_by`` is passed despite the test name --
    # confirm whether partitioning on "p" was intended.
    write_deltalake(table_uri, data)

    assert DeltaTable(table_uri).to_pyarrow_table() == data
Beispiel #7
0
def test_roundtrip_multi_partitioned(tmp_path: pathlib.Path,
                                     sample_data: pa.Table):
    """Round-trip ``sample_data`` partitioned by ``int32`` and ``bool``."""
    table_uri = str(tmp_path)
    write_deltalake(table_uri, sample_data, partition_by=["int32", "bool"])

    delta_table = DeltaTable(table_uri)
    assert delta_table.pyarrow_schema() == sample_data.schema

    # Rows are re-sorted by the "int64" column before the comparison.
    loaded = delta_table.to_pyarrow_table()
    row_order = pc.sort_indices(loaded["int64"])
    loaded = loaded.take(row_order)
    assert loaded == sample_data
Beispiel #8
0
def test_roundtrip_basic(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Write ``sample_data`` and read it back with identical schema and rows.

    Also checks that the zero-padded commit file ``00000000000000000000.json``
    exists in ``_delta_log`` after the write.
    """
    table_uri = str(tmp_path)
    write_deltalake(table_uri, sample_data)

    first_commit = "0" * 20 + ".json"
    log_entries = os.listdir(tmp_path / "_delta_log")
    assert first_commit in log_entries

    delta_table = DeltaTable(table_uri)
    assert delta_table.pyarrow_schema() == sample_data.schema

    loaded = delta_table.to_pyarrow_table()
    assert loaded == sample_data
Beispiel #9
0
def test_write_iterator(tmp_path: pathlib.Path, existing_table: DeltaTable,
                        sample_data: pa.Table):
    """Writing a bare batch iterable requires an explicit ``schema``."""
    target = str(tmp_path)
    batch_iter = existing_table.to_pyarrow_dataset().to_batches()

    # Without a schema the write is expected to raise ValueError.
    with pytest.raises(ValueError):
        write_deltalake(target, batch_iter, mode="overwrite")

    # The same batches object is reused, this time with a schema supplied.
    overwrite_kwargs = {"schema": sample_data.schema, "mode": "overwrite"}
    write_deltalake(target, batch_iter, **overwrite_kwargs)

    assert DeltaTable(target).to_pyarrow_table() == sample_data
Beispiel #10
0
def test_writer_null_stats(tmp_path: pathlib.Path):
    """Check the ``nullCount`` stats recorded for columns containing nulls."""
    columns = {
        "int32": pa.array([1, None, 2, None], pa.int32()),
        "float64": pa.array([1.0, None, None, None], pa.float64()),
        "str": pa.array([None] * 4, pa.string()),
    }
    table_uri = str(tmp_path)
    write_deltalake(table_uri, pa.table(columns))

    # Expected counts match the number of ``None`` values per column above.
    null_counts = get_stats(DeltaTable(table_uri))["nullCount"]
    assert null_counts == {"int32": 2, "float64": 3, "str": 4}
Beispiel #11
0
def test_writer_with_options(tmp_path: pathlib.Path):
    """Round-trip far-future timestamps using custom Parquet write options.

    Writes ten ``timestamp("us")`` values (years 9000-9009) with GZIP
    compression and ``coerce_timestamps="us"``, then reads the table back
    through a dataset and expects it to equal the input.
    """
    column_values = [
        datetime(year_, 1, 1, 0, 0, 0) for year_ in range(9000, 9010)
    ]
    data = pa.table({"colA": pa.array(column_values, pa.timestamp("us"))})
    path = str(tmp_path)
    # Pass the options directly to make_write_options(). Binding the result
    # of a chained ``.update(...)`` call risks handing ``None`` to the writer,
    # since update() appears to mutate in place without returning the options.
    opts = ParquetFileFormat().make_write_options(
        compression="GZIP", coerce_timestamps="us")
    write_deltalake(path, data, file_options=opts)

    table = (DeltaTable(path).to_pyarrow_dataset(
        parquet_read_options=ParquetReadOptions(
            coerce_int96_timestamp_unit="us")).to_table())

    assert table == data
Beispiel #12
0
def test_roundtrip_metadata(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Table name, description and configuration survive a write/read cycle."""
    table_uri = str(tmp_path)
    write_kwargs = {
        "name": "test_name",
        "description": "test_desc",
        "configuration": {"configTest": "foobar"},
    }
    write_deltalake(table_uri, sample_data, **write_kwargs)

    metadata = DeltaTable(table_uri).metadata()

    assert metadata.name == "test_name"
    assert metadata.description == "test_desc"
    assert metadata.configuration == {"configTest": "foobar"}
Beispiel #13
0
def test_write_modes(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Exercise default, error, ignore, append and overwrite in sequence."""
    path = str(tmp_path)

    def read_back():
        # Always open a fresh DeltaTable so the latest state is read.
        return DeltaTable(path).to_pyarrow_table()

    # Initial write with the default mode.
    write_deltalake(path, sample_data)
    assert read_back() == sample_data

    # "error" against the now-existing table is expected to raise.
    with pytest.raises(AssertionError):
        write_deltalake(path, sample_data, mode="error")

    # "ignore": the ...0001.json commit file must not appear in the log.
    write_deltalake(path, sample_data, mode="ignore")
    second_commit = "0" * 19 + "1.json"
    assert second_commit not in os.listdir(tmp_path / "_delta_log")

    # "append": the table should now hold the sample data twice.
    write_deltalake(path, sample_data, mode="append")
    doubled = pa.concat_tables([sample_data, sample_data])
    assert read_back() == doubled

    # "overwrite": back to a single copy of the sample data.
    write_deltalake(path, sample_data, mode="overwrite")
    assert read_back() == sample_data
Beispiel #14
0
def existing_table(tmp_path: pathlib.Path, sample_data: pa.Table):
    """Write ``sample_data`` to ``tmp_path`` and return a ``DeltaTable`` on it.

    NOTE(review): tests take ``existing_table`` as an argument, so this is
    presumably registered as a pytest fixture by a decorator not visible
    here -- confirm.
    """
    table_uri = str(tmp_path)
    write_deltalake(table_uri, sample_data)
    return DeltaTable(table_uri)
Beispiel #15
0
def test_writer_fails_on_protocol(existing_table: DeltaTable,
                                  sample_data: pa.Table):
    """Expect ``DeltaTableProtocolError`` when ``protocol()`` is mocked.

    ``existing_table.protocol`` is replaced with a mock returning
    ``ProtocolVersions(1, 2)`` before the overwrite is attempted.
    """
    mocked_versions = ProtocolVersions(1, 2)
    existing_table.protocol = Mock(return_value=mocked_versions)

    with pytest.raises(DeltaTableProtocolError):
        write_deltalake(existing_table, sample_data, mode="overwrite")
Beispiel #16
0
def test_writer_with_table(existing_table: DeltaTable, sample_data: pa.Table):
    """Overwrite via a ``DeltaTable`` handle, then re-read through it."""
    write_deltalake(existing_table, sample_data, mode="overwrite")

    # Refresh the handle before reading the table contents back.
    existing_table.update_incremental()
    refreshed = existing_table.to_pyarrow_table()
    assert refreshed == sample_data