Example #1
0
def test_pandas_parquet_2_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # Pandas only support ns resolution, Arrow at the moment only ms
        'datetime': np.arange("2016-01-01T00:00:00.001", size,
                              dtype='datetime64[ms]'),
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
    A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #2
0
def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Example #3
0
def test_pandas_parquet_2_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # Pandas only support ns resolution, Arrow at the moment only ms
        'datetime': np.arange("2016-01-01T00:00:00.001", size,
                              dtype='datetime64[ms]'),
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
    A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #4
0
def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Example #5
0
 def _check_pandas_roundtrip(self, df, expected=None,
                             timestamps_to_ms=False):
     table = A.from_pandas_dataframe(df, timestamps_to_ms=timestamps_to_ms)
     result = table.to_pandas()
     if expected is None:
         expected = df
     tm.assert_frame_equal(result, expected)
Example #6
0
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #7
0
 def _check_pandas_roundtrip(self,
                             df,
                             expected=None,
                             timestamps_to_ms=False):
     table = A.from_pandas_dataframe(df, timestamps_to_ms=timestamps_to_ms)
     result = table.to_pandas()
     if expected is None:
         expected = df
     tm.assert_frame_equal(result, expected)
Example #8
0
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #9
0
    def test_bytes_to_binary(self):
        values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
        df = pd.DataFrame({'strings': values})

        table = A.from_pandas_dataframe(df)
        assert table[0].type == A.binary()

        values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
        expected = pd.DataFrame({'strings': values2})
        self._check_pandas_roundtrip(df, expected)
Example #10
0
 def test_date(self):
     df = pd.DataFrame({
         'date': [
             datetime.date(2000, 1, 1), None,
             datetime.date(1970, 1, 1),
             datetime.date(2040, 2, 26)
         ]
     })
     table = A.from_pandas_dataframe(df)
     result = table.to_pandas()
     expected = df.copy()
     expected['date'] = pd.to_datetime(df['date'])
     tm.assert_frame_equal(result, expected)
Example #11
0
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Example #12
0
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Example #13
0
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = A.from_pandas_dataframe(df)

    with open(filename, 'wb') as f:
        A.parquet.write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(open(filename, 'rb').read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #14
0
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = A.from_pandas_dataframe(df)

    with open(filename, 'wb') as f:
        A.parquet.write_table(arrow_table, f, version="1.0")

    data = io.BytesIO(open(filename, 'rb').read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #15
0
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)

    for use_dictionary in [True, False]:
        A.parquet.write_table(
                arrow_table,
                filename.strpath,
                version="2.0",
                use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        A.parquet.write_table(
                arrow_table,
                filename.strpath,
                version="2.0",
                compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Example #16
0
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)

    for use_dictionary in [True, False]:
        A.parquet.write_table(
                arrow_table,
                filename.strpath,
                version="2.0",
                use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        A.parquet.write_table(
                arrow_table,
                filename.strpath,
                version="2.0",
                compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Example #17
0
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int16),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Example #18
0
# If you have any questions, suggestions, or comments on this example,
# please use the HDF-EOS Forum (http://hdfeos.org/forums).
#
# If you would like to see an example of any other NASA HDF/HDF-EOS data
# product, feel free to contact us at [email protected] or
# post it at the HDF-EOS Forum (http://hdfeos.org/forums).
#
# This script was tested on Mac OS X Mavericks machine with the latest
# parquet and arrow compiled from GitHub repository.
#
# Last tested: 9/22/2016
# Author: Hyo-Kyung Lee
import pyarrow as A
import pyarrow.parquet as pq
import pandas as pd
import h5py

FILE_NAME='/tmp/GSSTF_NCEP.3.1987.07.01.he5'
with h5py.File(FILE_NAME, mode='r') as f:
    dset_var = f['/HDFEOS/GRIDS/NCEP/Data Fields/SST']
    values = dset_var[0,:]
data = {}
data['i4'] = values.astype('i4')
filename='GSSTF.parquet'
df=pd.DataFrame(data)
arrow_table = A.from_pandas_dataframe(df)
A.parquet.write_table(arrow_table, filename, version="2.0")
table_read = pq.read_table(filename)
df_read = table_read.to_pandas()
print(df_read)
Example #19
0
 def time_from_series(self, n, dtype):
     A.from_pandas_dataframe(self.data)
Example #20
0
 def time_from_series(self, n, dtype):
     A.from_pandas_dataframe(self.data)
Example #21
0
 def setup(self, n, dtype):
     super(PandasConversionsFromArrow, self).setup(n, dtype)
     self.arrow_data = A.from_pandas_dataframe(self.data)
Example #22
0
 def setup(self, n, dtype):
     super(PandasConversionsFromArrow, self).setup(n, dtype)
     self.arrow_data = A.from_pandas_dataframe(self.data)
Example #23
0
 def peakmem_from_series(self, n, dtype):
     A.from_pandas_dataframe(self.data)
Example #24
0
 def peakmem_from_series(self, n, dtype):
     A.from_pandas_dataframe(self.data)
Example #25
0
# If you have any questions, suggestions, or comments on this example,
# please use the HDF-EOS Forum (http://hdfeos.org/forums).
#
# If you would like to see an example of any other NASA HDF/HDF-EOS data
# product, feel free to contact us at [email protected] or
# post it at the HDF-EOS Forum (http://hdfeos.org/forums).
#
# This script was tested on Mac OS X Mavericks machine with the latest
# parquet and arrow compiled from GitHub repository.
#
# Last tested: 9/22/2016
# Author: Hyo-Kyung Lee
import pyarrow as A
import pyarrow.parquet as pq
import pandas as pd
import h5py

FILE_NAME = '/tmp/GSSTF_NCEP.3.1987.07.01.he5'
with h5py.File(FILE_NAME, mode='r') as f:
    dset_var = f['/HDFEOS/GRIDS/NCEP/Data Fields/SST']
    values = dset_var[0, :]
data = {}
data['i4'] = values.astype('i4')
filename = 'GSSTF.parquet'
df = pd.DataFrame(data)
arrow_table = A.from_pandas_dataframe(df)
A.parquet.write_table(arrow_table, filename, version="2.0")
table_read = pq.read_table(filename)
df_read = table_read.to_pandas()
print(df_read)