def test_pandas_parquet_2_0_rountrip(tmpdir):
    """Round-trip a DataFrame covering all supported dtypes through a
    Parquet file written with format version 2.0 and check the data is
    read back unchanged."""
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # fixed: this column was built with dtype=np.int16 (copy-paste from
        # the int16 line), so the int8 conversion path was never exercised
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        # Pandas only support ns resolution, Arrow at the moment only ms
        'datetime': np.arange("2016-01-01T00:00:00.001", size,
                              dtype='datetime64[ms]'),
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True)
    A.parquet.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_1_0_rountrip(tmpdir):
    """Round-trip a DataFrame through a Parquet file written with format
    version 1.0, accounting for the uint32 -> int64 widening that the
    1.0 format requires."""
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # fixed: this column was built with dtype=np.int16 (copy-paste from
        # the int16 line), so the int8 conversion path was never exercised
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)
    A.parquet.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
def _check_pandas_roundtrip(self, df, expected=None, timestamps_to_ms=False):
    """Convert *df* to an Arrow table, convert back to pandas, and
    assert the result matches *expected* (or *df* itself when no
    expected frame is given)."""
    converted = A.from_pandas_dataframe(df, timestamps_to_ms=timestamps_to_ms)
    roundtripped = converted.to_pandas()
    tm.assert_frame_equal(roundtripped, df if expected is None else expected)
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Write a table to an in-memory Arrow output stream and read it
    back through a buffer reader, expecting an identical frame."""
    frame = _test_dataframe(10000)
    table = A.from_pandas_dataframe(frame)

    sink = paio.InMemoryOutputStream()
    pq.write_table(table, sink, version="2.0")

    source = paio.BufferReader(sink.get_result())
    roundtripped = pq.read_table(source).to_pandas()
    pdt.assert_frame_equal(frame, roundtripped)
def test_bytes_to_binary(self):
    """A column mixing unicode and bytes values must be converted to the
    Arrow binary type, and round-trip back as all-bytes values."""
    mixed = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
    df = pd.DataFrame({'strings': mixed})

    table = A.from_pandas_dataframe(df)
    assert table[0].type == A.binary()

    all_bytes = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
    self._check_pandas_roundtrip(df, pd.DataFrame({'strings': all_bytes}))
def test_date(self):
    """datetime.date values (including None) come back from Arrow as
    pandas datetimes."""
    dates = [
        datetime.date(2000, 1, 1),
        None,
        datetime.date(1970, 1, 1),
        datetime.date(2040, 2, 26),
    ]
    df = pd.DataFrame({'date': dates})

    result = A.from_pandas_dataframe(df).to_pandas()

    expected = df.copy()
    expected['date'] = pd.to_datetime(df['date'])
    tm.assert_frame_equal(result, expected)
def test_pandas_column_selection(tmpdir):
    """read_table(columns=...) must return only the requested columns."""
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
    })
    path = tmpdir.join('pandas_rountrip.parquet').strpath

    A.parquet.write_table(A.from_pandas_dataframe(df), path)
    selected = pq.read_table(path, columns=['uint8']).to_pandas()

    pdt.assert_frame_equal(df[['uint8']], selected)
def test_pandas_parquet_pyfile_roundtrip(tmpdir):
    """Write a Parquet file through a plain Python file object and read
    it back through an io.BytesIO wrapper, expecting an identical frame."""
    filename = tmpdir.join('pandas_pyfile_roundtrip.parquet').strpath
    size = 5
    df = pd.DataFrame({
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'strings': ['foo', 'bar', None, 'baz', 'qux']
    })

    arrow_table = A.from_pandas_dataframe(df)

    with open(filename, 'wb') as f:
        A.parquet.write_table(arrow_table, f, version="1.0")

    # fixed: the original `io.BytesIO(open(filename, 'rb').read())` leaked
    # the file handle; close it deterministically with a context manager
    with open(filename, 'rb') as f:
        data = io.BytesIO(f.read())

    table_read = pq.read_table(data)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_configuration_options(tmpdir):
    """Round-trip the same table under every dictionary-encoding setting
    and every supported compression codec, expecting identical data back
    each time."""
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # fixed: this column was built with dtype=np.int16 (copy-paste from
        # the int16 line), so the int8 conversion path was never exercised
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = A.from_pandas_dataframe(df)

    for use_dictionary in [True, False]:
        A.parquet.write_table(
            arrow_table, filename.strpath,
            version="2.0",
            use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        A.parquet.write_table(
            arrow_table, filename.strpath,
            version="2.0",
            compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    """Round-trip a many-dtype DataFrame through Arrow's native in-memory
    stream and buffer reader, expecting an identical frame back."""
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        # fixed: this column was built with dtype=np.int16 (copy-paste from
        # the int16 line), so the int8 conversion path was never exercised
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
# If you have any questions, suggestions, or comments on this example,
# please use the HDF-EOS Forum (http://hdfeos.org/forums).
#
# If you would like to see an example of any other NASA HDF/HDF-EOS data
# product, feel free to contact us at [email protected] or
# post it at the HDF-EOS Forum (http://hdfeos.org/forums).
#
# This script was tested on Mac OS X Mavericks machine with the latest
# parquet and arrow compiled from GitHub repository.
#
# Last tested: 9/22/2016
# Author: Hyo-Kyung Lee
import pyarrow as A
import pyarrow.parquet as pq
import pandas as pd
import h5py

HDF5_PATH = '/tmp/GSSTF_NCEP.3.1987.07.01.he5'
PARQUET_PATH = 'GSSTF.parquet'

# Pull the first row of the sea-surface-temperature grid out of the
# HDF-EOS file; slicing materializes a NumPy array, so the file can be
# closed before the data is used.
with h5py.File(HDF5_PATH, mode='r') as f:
    sst_first_row = f['/HDFEOS/GRIDS/NCEP/Data Fields/SST'][0, :]

# Store the values as 32-bit integers and round-trip them through Parquet.
frame = pd.DataFrame({'i4': sst_first_row.astype('i4')})
A.parquet.write_table(A.from_pandas_dataframe(frame), PARQUET_PATH,
                      version="2.0")
print(pq.read_table(PARQUET_PATH).to_pandas())
def time_from_series(self, n, dtype):
    """asv timing benchmark: pandas -> Arrow conversion of the
    prepared data; the resulting table is discarded."""
    table = A.from_pandas_dataframe(self.data)
def setup(self, n, dtype):
    # Build the pandas fixture via the base class, then pre-convert it to
    # an Arrow table so the benchmarks measure only Arrow -> pandas work.
    super(PandasConversionsFromArrow, self).setup(n, dtype)
    self.arrow_data = A.from_pandas_dataframe(self.data)
def peakmem_from_series(self, n, dtype):
    """asv peak-memory benchmark: pandas -> Arrow conversion of the
    prepared data; the resulting table is discarded."""
    table = A.from_pandas_dataframe(self.data)
# If you have any questions, suggestions, or comments on this example,
# please use the HDF-EOS Forum (http://hdfeos.org/forums).
#
# If you would like to see an example of any other NASA HDF/HDF-EOS data
# product, feel free to contact us at [email protected] or
# post it at the HDF-EOS Forum (http://hdfeos.org/forums).
#
# This script was tested on Mac OS X Mavericks machine with the latest
# parquet and arrow compiled from GitHub repository.
#
# Last tested: 9/22/2016
# Author: Hyo-Kyung Lee
import pyarrow as A
import pyarrow.parquet as pq
import pandas as pd
import h5py

SOURCE_FILE = '/tmp/GSSTF_NCEP.3.1987.07.01.he5'
OUTPUT_FILE = 'GSSTF.parquet'

# Read one row of the SST grid; the slice copies into a NumPy array, so
# no HDF5 handle is needed afterwards.
with h5py.File(SOURCE_FILE, mode='r') as hdf:
    sst_dataset = hdf['/HDFEOS/GRIDS/NCEP/Data Fields/SST']
    first_row = sst_dataset[0, :]

# Convert to a single-column frame of 32-bit ints, write it out as
# Parquet (format 2.0), then read it back and show the result.
columns = {'i4': first_row.astype('i4')}
table = A.from_pandas_dataframe(pd.DataFrame(columns))
A.parquet.write_table(table, OUTPUT_FILE, version="2.0")
print(pq.read_table(OUTPUT_FILE).to_pandas())