Example #1
def test_coerce_timestamps_truncated(tempdir):
    """
    ARROW-2555: Test that we can truncate timestamps when coercing if
    explicitly allowed.
    """
    dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1, microsecond=1)
    dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
                              second=1)

    fields_us = [pa.field('datetime64', pa.timestamp('us'))]
    arrays_us = {'datetime64': [dt_us, dt_ms]}

    df_us = pd.DataFrame(arrays_us)
    schema_us = pa.schema(fields_us)

    filename = tempdir / 'pandas_truncated.parquet'
    table_us = pa.Table.from_pandas(df_us, schema=schema_us)

    _write_table(table_us, filename, version="2.0", coerce_timestamps='ms',
                 allow_truncated_timestamps=True)
    table_ms = _read_table(filename)
    df_ms = table_ms.to_pandas()

    arrays_expected = {'datetime64': [dt_ms, dt_ms]}
    df_expected = pd.DataFrame(arrays_expected)
    tm.assert_frame_equal(df_expected, df_ms)
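These examples are taken from pyarrow's Parquet test suite, so they lean on shared pytest fixtures (tempdir, use_legacy_dataset, pre_buffer, dtype) and helper functions (_write_table, _read_table, _check_roundtrip, alltypes_sample, _test_dataframe, dataframe_with_arrays, dataframe_with_lists) defined elsewhere in that suite. Below is a minimal sketch of the imports the snippets need, plus hypothetical stand-ins for the two most common helpers, assuming they are thin wrappers around pyarrow.parquet.write_table and read_table:

import io
import datetime
import decimal
import json

import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.fs import FileSystem, LocalFileSystem, SubTreeFileSystem


def _write_table(table, where, **kwargs):
    # Hypothetical stand-in for the suite's helper: accept either a pandas
    # DataFrame or an Arrow table and delegate to pq.write_table.
    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)
    pq.write_table(table, where, **kwargs)
    return table


def _read_table(*args, **kwargs):
    # Hypothetical stand-in: delegate directly to pq.read_table.
    return pq.read_table(*args, **kwargs)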
Example #2
def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename, version='1.0')
    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    tm.assert_frame_equal(df, df_read)
Example #3
def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
    tempdir, use_legacy_dataset
):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    js = arrow_table.schema.pandas_metadata
    assert not js['index_columns']
    # ARROW-2170
    # While index_columns should be empty, columns still needs to be filled.
    assert js['columns']

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')
    table_read = pq.read_pandas(
        filename, use_legacy_dataset=use_legacy_dataset)

    js = table_read.schema.pandas_metadata
    assert not js['index_columns']

    read_metadata = table_read.schema.metadata
    assert arrow_table.schema.metadata == read_metadata

    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
Example #4
def test_read_single_row_group_with_column_subset():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table,
                 buf,
                 row_group_size=N / K,
                 compression='snappy',
                 version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    cols = list(df.columns[:2])
    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
    result = pa.concat_tables(row_groups)
    tm.assert_frame_equal(df[cols], result.to_pandas())
Example #5
def test_pandas_column_selection(tempdir, use_legacy_dataset):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    _write_table(arrow_table, filename)
    table_read = _read_table(filename,
                             columns=['uint8'],
                             use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)

    # ARROW-4267: Selection of duplicate columns still leads to these columns
    # being read uniquely.
    table_read = _read_table(filename,
                             columns=['uint8', 'uint8'],
                             use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df[['uint8']], df_read)
Example #6
def test_write_error_deletes_incomplete_file(tempdir):
    # ARROW-1285
    df = pd.DataFrame({
        'a': list('abc'),
        'b': list(range(1, 4)),
        'c': np.arange(3, 6).astype('u1'),
        'd': np.arange(4.0, 7.0, dtype='float64'),
        'e': [True, False, True],
        'f': pd.Categorical(list('abc')),
        'g': pd.date_range('20130101', periods=3),
        'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
        'i': pd.date_range('20130101', periods=3, freq='ns')
    })

    pdf = pa.Table.from_pandas(df)

    filename = tempdir / 'tmp_file'
    try:
        _write_table(pdf, filename)
    except pa.ArrowException:
        pass

    assert not filename.exists()
Example #7
def test_parquet_invalid_version(tempdir):
    table = pa.table({'a': [1, 2, 3]})
    with pytest.raises(ValueError, match="Unsupported Parquet format version"):
        _write_table(table, tempdir / 'test_version.parquet', version="2.2")
    with pytest.raises(ValueError, match="Unsupported Parquet data page " +
                       "version"):
        _write_table(table, tempdir / 'test_version.parquet',
                     data_page_version="2.2")
Example #8
def test_special_chars_filename(tempdir, use_legacy_dataset):
    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
    filename = "foo # bar"
    path = tempdir / filename
    assert not path.exists()
    _write_table(table, str(path))
    assert path.exists()
    table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
Example #9
def test_decimal_roundtrip_negative_scale(tempdir):
    expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
    filename = tempdir / 'decimals.parquet'
    string_filename = str(filename)
    t = pa.Table.from_pandas(expected)
    _write_table(t, string_filename)
    result_table = _read_table(string_filename)
    result = result_table.to_pandas()
    tm.assert_frame_equal(result, expected)
Example #10
def test_column_of_arrays(tempdir):
    df, schema = dataframe_with_arrays()

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)
    _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
Example #11
def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset):
    df = _test_dataframe(0)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = _read_table(
        reader, use_legacy_dataset=use_legacy_dataset).to_pandas()
    tm.assert_frame_equal(df, df_read)
Example #12
def test_column_of_lists(tempdir):
    df, schema = dataframe_with_lists(parquet_compatible=True)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df, schema=schema)
    _write_table(arrow_table, filename, version='2.0')
    table_read = _read_table(filename)
    df_read = table_read.to_pandas()

    tm.assert_frame_equal(df, df_read)
Example #13
def test_large_table_int32_overflow():
    size = np.iinfo('int32').max + 1

    arr = np.ones(size, dtype='uint8')

    parr = pa.array(arr, type=pa.uint8())

    table = pa.Table.from_arrays([parr], names=['one'])
    f = io.BytesIO()
    _write_table(table, f)
Example #14
def test_large_binary_overflow():
    s = b'x' * (1 << 31)
    arr = pa.array([s], type=pa.large_binary())
    table = pa.Table.from_arrays([arr], names=['strs'])
    for use_dictionary in [False, True]:
        writer = pa.BufferOutputStream()
        with pytest.raises(
                pa.ArrowInvalid,
                match="Parquet cannot store strings with size 2GB or more"):
            _write_table(table, writer, use_dictionary=use_dictionary)
Example #15
def test_compression_level(use_legacy_dataset):
    arr = pa.array(list(map(int, range(1000))))
    data = [arr, arr]
    table = pa.Table.from_arrays(data, names=['a', 'b'])

    # Check one compression level.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     compression_level=1,
                     use_legacy_dataset=use_legacy_dataset)

    # Check another one to make sure that compression_level=1 does not
    # coincide with the default one in Arrow.
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     compression_level=5,
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression per column
    _check_roundtrip(table,
                     expected=table,
                     compression={
                         'a': "gzip",
                         'b': "snappy"
                     },
                     use_legacy_dataset=use_legacy_dataset)

    # Check that the user can provide a compression level per column
    _check_roundtrip(table,
                     expected=table,
                     compression="gzip",
                     compression_level={
                         'a': 2,
                         'b': 3
                     },
                     use_legacy_dataset=use_legacy_dataset)

    # Check that specifying a compression level for a codec which does not
    # allow specifying one results in an error.
    # Uncompressed, snappy, lz4 and lzo do not support specifying a
    # compression level.
    # GZIP (zlib) allows specifying a compression level, but as of version
    # 1.2.11 the valid range is [-1, 9].
    invalid_combinations = [("snappy", 4), ("lz4", 5), ("gzip", -1337),
                            ("None", 444), ("lzo", 14)]
    buf = io.BytesIO()
    for (codec, level) in invalid_combinations:
        with pytest.raises((ValueError, OSError)):
            _write_table(table,
                         buf,
                         compression=codec,
                         compression_level=level)
Example #16
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas('data.parquet',
                            filesystem=SubTreeFileSystem(
                                str(tempdir), LocalFileSystem()))
    assert result.equals(pa.table(df))
Example #17
def test_pre_buffer(pre_buffer):
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.6')

    buf.seek(0)
    pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
    assert pf.read().num_rows == N
Example #18
def test_read_pandas_column_subset(tempdir, use_legacy_dataset):
    df = _test_dataframe(10000)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    _write_table(arrow_table, imos, version="2.0")
    buf = imos.getvalue()
    reader = pa.BufferReader(buf)
    df_read = pq.read_pandas(
        reader, columns=['strings', 'uint8'],
        use_legacy_dataset=use_legacy_dataset
    ).to_pandas()
    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
Example #19
def test_memory_map(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)

    table = pa.Table.from_pandas(df)
    _check_roundtrip(table, read_table_kwargs={'memory_map': True},
                     version='2.6', use_legacy_dataset=use_legacy_dataset)

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as f:
        _write_table(table, f, version='2.6')
    table_read = pq.read_pandas(filename, memory_map=True,
                                use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
Example #20
def test_enable_buffered_stream(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)

    table = pa.Table.from_pandas(df)
    _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025},
                     version='2.6', use_legacy_dataset=use_legacy_dataset)

    filename = str(tempdir / 'tmp_file')
    with open(filename, 'wb') as f:
        _write_table(table, f, version='2.6')
    table_read = pq.read_pandas(filename, buffer_size=4096,
                                use_legacy_dataset=use_legacy_dataset)
    assert table_read.equals(table)
Example #21
def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset):
    filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__)
    data = [pa.array(list(map(dtype, range(5))))]
    table = pa.Table.from_arrays(data, names=['a'])
    _write_table(table, filename)
    table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset)
    for i in range(table.num_columns):
        col_written = table[i]
        col_read = table_read[i]
        assert table.field(i).name == table_read.field(i).name
        assert col_read.num_chunks == 1
        data_written = col_written.chunk(0)
        data_read = col_read.chunk(0)
        assert data_written.equals(data_read)
Example #22
def test_scan_contents():
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, row_group_size=N / K,
                 compression='snappy', version='2.0')

    buf.seek(0)
    pf = pq.ParquetFile(buf)

    assert pf.scan_contents() == 10000
    assert pf.scan_contents(df.columns[:4]) == 10000
Example #23
def test_pandas_can_write_nested_data(tempdir):
    data = {
        "agg_col": [
            {"page_type": 1},
            {"record_type": 1},
            {"non_consecutive_home": 0},
        ],
        "uid_first": "1001"
    }
    df = pd.DataFrame(data=data)
    arrow_table = pa.Table.from_pandas(df)
    imos = pa.BufferOutputStream()
    # This succeeds under V2
    _write_table(arrow_table, imos)
Example #24
def test_min_chunksize(use_legacy_dataset):
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    _write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = _read_table(buf, use_legacy_dataset=use_legacy_dataset)

    assert result.equals(table)

    with pytest.raises(ValueError):
        _write_table(table, buf, chunk_size=0)
Example #25
def test_fspath(tempdir, use_legacy_dataset):
    # ARROW-12472 support __fspath__ objects without using str()
    path = tempdir / "test.parquet"
    table = pa.table({"a": [1, 2, 3]})
    _write_table(table, path)

    fs_protocol_obj = util.FSProtocolClass(path)

    result = _read_table(fs_protocol_obj,
                         use_legacy_dataset=use_legacy_dataset)
    assert result.equals(table)

    # combined with non-local filesystem raises
    with pytest.raises(TypeError):
        _read_table(fs_protocol_obj, filesystem=FileSystem())
Example #26
def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset):
    df = alltypes_sample(size=10)
    df.columns = pd.MultiIndex.from_tuples(
        list(zip(df.columns, df.columns[::-1])),
        names=['level_1', 'level_2'])

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert arrow_table.schema.pandas_metadata is not None

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')

    table_read = pq.read_pandas(filename,
                                use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
Example #27
def test_multiple_path_types(tempdir, use_legacy_dataset):
    # Test compatibility with PEP 519 path-like objects
    path = tempdir / 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)

    # Test compatibility with plain string paths
    path = str(tempdir) + 'zzz.parquet'
    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
    _write_table(df, path)
    table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df, df_read)
Example #28
def test_multithreaded_read(use_legacy_dataset):
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(table, buf, compression='SNAPPY', version='2.6')

    buf.seek(0)
    table1 = _read_table(
        buf, use_threads=True, use_legacy_dataset=use_legacy_dataset)

    buf.seek(0)
    table2 = _read_table(
        buf, use_threads=False, use_legacy_dataset=use_legacy_dataset)

    assert table1.equals(table2)
Example #29
def test_pandas_parquet_custom_metadata(tempdir):
    df = alltypes_sample(size=10000)

    filename = tempdir / 'pandas_roundtrip.parquet'
    arrow_table = pa.Table.from_pandas(df)
    assert b'pandas' in arrow_table.schema.metadata

    _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms')

    metadata = pq.read_metadata(filename).metadata
    assert b'pandas' in metadata

    js = json.loads(metadata[b'pandas'].decode('utf8'))
    assert js['index_columns'] == [{'kind': 'range',
                                    'name': None,
                                    'start': 0, 'stop': 10000,
                                    'step': 1}]
Example #30
def test_pass_separate_metadata():
    # ARROW-471
    df = alltypes_sample(size=10000)

    a_table = pa.Table.from_pandas(df)

    buf = io.BytesIO()
    _write_table(a_table, buf, compression='snappy', version='2.0')

    buf.seek(0)
    metadata = pq.read_metadata(buf)

    buf.seek(0)

    fileh = pq.ParquetFile(buf, metadata=metadata)

    tm.assert_frame_equal(df, fileh.read().to_pandas())