Example #1
def test_byte_stream_split(use_legacy_dataset):
    # This is only a smoke test.
    arr_float = pa.array(list(map(float, range(100))))
    arr_int = pa.array(list(map(int, range(100))))
    data_float = [arr_float, arr_float]
    table = pa.Table.from_arrays(data_float, names=['a', 'b'])

    # Check with byte_stream_split for both columns.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     use_dictionary=False,
                     use_byte_stream_split=True)

    # Check with byte_stream_split for column 'b' and dictionary
    # for column 'a'.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     use_dictionary=['a'],
                     use_byte_stream_split=['b'])

    # Check with a collision for both columns.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     use_dictionary=['a', 'b'],
                     use_byte_stream_split=['a', 'b'])

    # Check with mixed column types.
    mixed_table = pa.Table.from_arrays([arr_float, arr_int], names=['a', 'b'])
    _check_roundtrip(mixed_table,
                     expected=mixed_table,
                     use_dictionary=['b'],
                     use_byte_stream_split=['a'])

    # Try to use the wrong data type with the byte_stream_split encoding.
    # This should throw an exception.
    table = pa.Table.from_arrays([arr_int], names=['tmp'])
    with pytest.raises(IOError):
        _check_roundtrip(table,
                         expected=table,
                         use_byte_stream_split=True,
                         use_dictionary=False,
                         use_legacy_dataset=use_legacy_dataset)
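
Note: the _check_roundtrip helper used throughout these examples belongs to the Arrow test suite and is not shown here. A minimal sketch of what such a helper might look like, assuming only the public pyarrow.parquet API (the name _simple_roundtrip and the reduced signature are illustrative; the real helper also exercises options such as use_legacy_dataset, which pq.write_table does not accept):

import io

import pyarrow.parquet as pq


def _simple_roundtrip(table, expected=None, **write_kwargs):
    # Hypothetical stand-in for _check_roundtrip: write the table to an
    # in-memory buffer, read it back, and verify the result matches.
    expected = expected if expected is not None else table
    buf = io.BytesIO()
    pq.write_table(table, buf, **write_kwargs)
    buf.seek(0)
    assert pq.read_table(buf).equals(expected)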
Example #2
def test_compression_level(use_legacy_dataset):
    arr = pa.array(list(map(int, range(1000))))
    data = [arr, arr]
    table = pa.Table.from_arrays(data, names=['a', 'b'])

    # Check one compression level.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     compression_level=1,
                     use_legacy_dataset=use_legacy_dataset)

    # Check another one to make sure that compression_level=1 does not
    # coincide with the default one in Arrow.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     compression_level=5,
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression per column
    _check_roundtrip(table,
                     expected=table,
                     compression={
                         'a': "gzip",
                         'b': "snappy"
                     },
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression level per column
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     compression_level={
                         'a': 2,
                         'b': 3
                     },
                     use_legacy_dataset=use_legacy_dataset)

    # Check that specifying a compression level for a codec which does not
    # allow specifying one results in an error.
    # Uncompressed, snappy, lz4 and lzo do not support specifying a
    # compression level.
    # GZIP (zlib) allows specifying a compression level, but as of
    # version 1.2.11 the valid range is [-1, 9].
    invalid_combinations = [("snappy", 4), ("lz4", 5), ("gzip", -1337),
                            ("None", 444), ("lzo", 14)]
    buf = io.BytesIO()
    for (codec, level) in invalid_combinations:
        with pytest.raises((ValueError, OSError)):
            _write_table(table,
                         buf,
                         compression=codec,
                         compression_level=level)
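
The codec actually applied to each column can be checked through the file metadata. A short sketch, assuming the snappy codec is available in the local pyarrow build:

import io

import pyarrow as pa
import pyarrow.parquet as pq

arr = pa.array(range(1000))
table = pa.Table.from_arrays([arr, arr], names=['a', 'b'])

buf = io.BytesIO()
pq.write_table(table, buf, compression={'a': "gzip", 'b': "snappy"})
buf.seek(0)

# Inspect the codec recorded for each column chunk of the first row group.
meta = pq.ParquetFile(buf).metadata
for i in range(meta.num_columns):
    col = meta.row_group(0).column(i)
    print(col.path_in_schema, col.compression)  # 'a GZIP', 'b SNAPPY'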
Example #3
def test_writing_empty_lists():
    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
    arr1 = pa.array([[], []], pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr1], ['list(int32)'])
    _check_roundtrip(table)
Example #4
def test_empty_lists_table_roundtrip(use_legacy_dataset):
    # ARROW-2744: Shouldn't crash when writing an array of empty lists
    arr = pa.array([[], []], type=pa.list_(pa.int32()))
    table = pa.Table.from_arrays([arr], ["A"])
    _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset)
Example #5
def test_parquet_version_timestamp_differences():
    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000

    d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
    d_ms = d_s * 1000
    d_us = d_ms * 1000
    d_ns = d_us * 1000

    a_s = pa.array(d_s, type=pa.timestamp('s'))
    a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
    a_us = pa.array(d_us, type=pa.timestamp('us'))
    a_ns = pa.array(d_ns, type=pa.timestamp('ns'))

    names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)

    # Using Parquet version 1.0, seconds should be coerced to milliseconds
    # and nanoseconds should be coerced to microseconds by default
    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
    _check_roundtrip(table, expected)

    # Using Parquet version 2.6, seconds should be coerced to milliseconds
    # and nanoseconds should be retained by default
    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
    _check_roundtrip(table, expected, version='2.6')

    # Using Parquet version 1.0, coercing to milliseconds or microseconds
    # is allowed
    expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
    _check_roundtrip(table, expected, coerce_timestamps='ms')

    # Using Parquet version 2.6, coercing to milliseconds or microseconds
    # is allowed
    expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
    _check_roundtrip(table, expected, version='2.6', coerce_timestamps='us')

    # TODO: after pyarrow allows coerce_timestamps='ns', tests like the
    # following should pass ...

    # Using Parquet version 1.0, coercing to nanoseconds is not allowed
    # expected = None
    # with pytest.raises(NotImplementedError):
    #     _roundtrip_table(table, coerce_timestamps='ns')

    # Using Parquet version 2.6, coercing to nanoseconds is allowed
    # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    # _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')

    # For either Parquet version, coercing to nanoseconds is allowed
    # if Int96 storage is used
    expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
    _check_roundtrip(table, expected, use_deprecated_int96_timestamps=True)
    _check_roundtrip(table,
                     expected,
                     version='2.6',
                     use_deprecated_int96_timestamps=True)
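
The coercion behaviour exercised above can also be observed directly with pq.write_table. A sketch, assuming default writer settings; allow_truncated_timestamps is needed whenever the coercion would drop sub-unit precision:

import io

import pyarrow as pa
import pyarrow.parquet as pq

# A nanosecond timestamp with sub-millisecond precision.
ns = pa.array([1262304000000000001], type=pa.timestamp('ns'))
table = pa.table([ns], names=['ts'])

buf = io.BytesIO()
# Coercing to 'ms' silently drops precision, so truncation must be allowed.
pq.write_table(table, buf, coerce_timestamps='ms',
               allow_truncated_timestamps=True)
buf.seek(0)
print(pq.read_table(buf).schema.field('ts').type)  # timestamp[ms]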
Example #6
def test_timestamp_restore_timezone():
    # ARROW-5888, restore timezone from serialized metadata
    ty = pa.timestamp('ms', tz='America/New_York')
    arr = pa.array([1, 2, 3], type=ty)
    t = pa.table([arr], names=['f0'])
    _check_roundtrip(t)
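
What is restored here is the timezone carried in the Arrow schema metadata that pyarrow serializes into the Parquet file. A minimal sketch showing the effect:

import io

import pyarrow as pa
import pyarrow.parquet as pq

ty = pa.timestamp('ms', tz='America/New_York')
t = pa.table([pa.array([1, 2, 3], type=ty)], names=['f0'])

buf = io.BytesIO()
pq.write_table(t, buf)
buf.seek(0)
# The timezone survives the roundtrip via the serialized Arrow schema.
print(pq.read_table(buf).schema.field('f0').type)
# -> timestamp[ms, tz=America/New_York]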
Example #7
def test_date_time_types(tempdir):
    t1 = pa.date32()
    data1 = np.array([17259, 17260, 17261], dtype='int32')
    a1 = pa.array(data1, type=t1)

    t2 = pa.date64()
    data2 = data1.astype('int64') * 86400000
    a2 = pa.array(data2, type=t2)

    t3 = pa.timestamp('us')
    start = pd.Timestamp('2001-01-01').value / 1000
    data3 = np.array([start, start + 1, start + 2], dtype='int64')
    a3 = pa.array(data3, type=t3)

    t4 = pa.time32('ms')
    data4 = np.arange(3, dtype='i4')
    a4 = pa.array(data4, type=t4)

    t5 = pa.time64('us')
    a5 = pa.array(data4.astype('int64'), type=t5)

    t6 = pa.time32('s')
    a6 = pa.array(data4, type=t6)

    ex_t6 = pa.time32('ms')
    ex_a6 = pa.array(data4 * 1000, type=ex_t6)

    t7 = pa.timestamp('ns')
    start = pd.Timestamp('2001-01-01').value
    data7 = np.array([start, start + 1000, start + 2000], dtype='int64')
    a7 = pa.array(data7, type=t7)

    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    # date64 as date32
    # time32[s] to time32[ms]
    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], [
        'date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]',
        'time32_from64[s]', 'timestamp[ns]'
    ])

    _check_roundtrip(table, expected=expected, version='2.6')

    t0 = pa.timestamp('ms')
    data0 = np.arange(4, dtype='int64')
    a0 = pa.array(data0, type=t0)

    t1 = pa.timestamp('us')
    data1 = np.arange(4, dtype='int64')
    a1 = pa.array(data1, type=t1)

    t2 = pa.timestamp('ns')
    data2 = np.arange(4, dtype='int64')
    a2 = pa.array(data2, type=t2)

    table = pa.Table.from_arrays([a0, a1, a2], ['ts[ms]', 'ts[us]', 'ts[ns]'])
    expected = pa.Table.from_arrays([a0, a1, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int64 for all timestamps supported by default
    filename = tempdir / 'int64_timestamps.parquet'
    _write_table(table, filename, version='2.6')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT64'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    t0_ns = pa.timestamp('ns')
    data0_ns = np.array(data0 * 1000000, dtype='int64')
    a0_ns = pa.array(data0_ns, type=t0_ns)

    t1_ns = pa.timestamp('ns')
    data1_ns = np.array(data1 * 1000, dtype='int64')
    a1_ns = pa.array(data1_ns, type=t1_ns)

    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])

    # int96 nanosecond timestamps produced upon request
    filename = tempdir / 'explicit_int96_timestamps.parquet'
    _write_table(table,
                 filename,
                 version='2.6',
                 use_deprecated_int96_timestamps=True)
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)

    # int96 nanosecond timestamps implied by flavor 'spark'
    filename = tempdir / 'spark_int96_timestamps.parquet'
    _write_table(table, filename, version='2.6', flavor='spark')
    parquet_schema = pq.ParquetFile(filename).schema
    for i in range(3):
        assert parquet_schema.column(i).physical_type == 'INT96'
    read_table = _read_table(filename)
    assert read_table.equals(expected)
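
The expected table above reuses a1 (date32) for the 'date64' column because Parquet stores dates as 32-bit day counts, so date64 data comes back as date32. A small sketch of that coercion, assuming default writer settings:

import io

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# date64 stores milliseconds since the epoch; Parquet only has a day-based
# DATE type, so the column is read back as date32.
days = np.array([17259, 17260], dtype='int64')
a64 = pa.array(days * 86400000, type=pa.date64())
table = pa.table([a64], names=['d'])

buf = io.BytesIO()
pq.write_table(table, buf)
buf.seek(0)
print(pq.read_table(buf).schema.field('d').type)  # date32[day]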
Example #8
def test_column_encoding(use_legacy_dataset):
    arr_float = pa.array(list(map(float, range(100))))
    arr_int = pa.array(list(map(int, range(100))))
    mixed_table = pa.Table.from_arrays([arr_float, arr_int], names=['a', 'b'])

    # Check "BYTE_STREAM_SPLIT" for column 'a' and "PLAIN" column_encoding for
    # column 'b'.
    _check_roundtrip(mixed_table,
                     expected=mixed_table,
                     use_dictionary=False,
                     column_encoding={
                         'a': "BYTE_STREAM_SPLIT",
                         'b': "PLAIN"
                     },
                     use_legacy_dataset=use_legacy_dataset)

    # Check "PLAIN" for all columns.
    _check_roundtrip(mixed_table,
                     expected=mixed_table,
                     use_dictionary=False,
                     column_encoding="PLAIN",
                     use_legacy_dataset=use_legacy_dataset)

    # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'.
    # This should throw an error as it only supports FLOAT and DOUBLE.
    with pytest.raises(IOError,
                       match="BYTE_STREAM_SPLIT only supports FLOAT and"
                       " DOUBLE"):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'b': "BYTE_STREAM_SPLIT"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass "DELTA_BINARY_PACKED".
    # This should throw an error as it is only supported for reading.
    with pytest.raises(IOError,
                       match="Not yet implemented: Selected encoding is"
                       " not supported."):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'b': "DELTA_BINARY_PACKED"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass "RLE_DICTIONARY".
    # This should throw an error as dictionary encoding is already used by
    # default and cannot be specified as the "fallback" encoding.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         column_encoding="RLE_DICTIONARY",
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass unsupported encoding.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         column_encoding={'a': "MADE_UP_ENCODING"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_dictionary.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=['b'],
                         column_encoding={'b': "PLAIN"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_dictionary=True (default value).
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         column_encoding={'b': "PLAIN"},
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_byte_stream_split on same column.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=['a'],
                         column_encoding={
                             'a': "RLE",
                             'b': "BYTE_STREAM_SPLIT"
                         },
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding and use_byte_stream_split=True.
    # This should throw an error.
    with pytest.raises(ValueError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         use_byte_stream_split=True,
                         column_encoding={
                             'a': "RLE",
                             'b': "BYTE_STREAM_SPLIT"
                         },
                         use_legacy_dataset=use_legacy_dataset)

    # Try to pass column_encoding=True.
    # This should throw an error.
    with pytest.raises(TypeError):
        _check_roundtrip(mixed_table,
                         expected=mixed_table,
                         use_dictionary=False,
                         column_encoding=True,
                         use_legacy_dataset=use_legacy_dataset)
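
The column_encoding writer option (available in pyarrow 8.0 and later) can also be passed directly to pq.write_table, and the encodings the writer actually chose are visible in the column chunk metadata. A sketch under those assumptions:

import io

import pyarrow as pa
import pyarrow.parquet as pq

arr_float = pa.array(list(map(float, range(100))))
arr_int = pa.array(list(map(int, range(100))))
table = pa.Table.from_arrays([arr_float, arr_int], names=['a', 'b'])

buf = io.BytesIO()
# column_encoding requires dictionary encoding to be turned off first.
pq.write_table(table, buf,
               use_dictionary=False,
               column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN"})
buf.seek(0)

meta = pq.ParquetFile(buf).metadata
for i in range(meta.num_columns):
    col = meta.row_group(0).column(i)
    print(col.path_in_schema, col.encodings)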
Example #9
def test_empty_table_no_columns(use_legacy_dataset):
    df = pd.DataFrame()
    empty = pa.Table.from_pandas(df, preserve_index=False)
    _check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset)