import glob
import json
import os

import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

from fastparquet import ParquetFile, writer

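# The tests below rely on a ``tempdir`` fixture yielding a fresh directory per
# test. The real suite may define it in conftest.py; a minimal sketch of that
# assumption:
@pytest.fixture
def tempdir(tmpdir):
    return str(tmpdir)
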
def test_empty_row_group(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world'], size=1000)})
    # The offset at 1800 is beyond the 1000 rows, so no empty third group
    # should be written.
    writer.write(fname, data, row_group_offsets=[0, 900, 1800])
    pf = ParquetFile(fname)
    assert len(pf.row_groups) == 2

@pytest.mark.parametrize('scheme', ['hive', 'drill'])
def test_groups_roundtrip(tempdir, scheme):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme=scheme)
    r = ParquetFile(tempdir)
    assert r.columns == ['b']
    out = r.to_pandas()
    if scheme == 'drill':
        assert set(r.cats) == {'dir0', 'dir1'}
        assert set(out.columns) == {'b', 'dir0', 'dir1'}
        out.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50],
                 partition_on=['a', 'c'], file_scheme=scheme)
    r = ParquetFile(tempdir)
    assert r.fmd.num_rows == r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()
    if scheme == 'drill':
        assert set(out.columns) == {'b', 'dir0', 'dir1'}
        out.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

def test_too_many_partition_columns(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', 'c'], size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    with pytest.raises(ValueError) as ve:
        writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive')
    assert "Cannot include all columns" in str(ve.value)

# Representative parametrization; the original suite's exact compression
# values may differ.
@pytest.mark.parametrize('compression', [None, 'gzip', {'x': 'gzip'},
                                         {'x': 'gzip', 'y': None}])
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
    fn = os.path.join(tempdir, 'tmp.parq')
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

def test_dotted_column(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x.y': [1, 2, 3], 'y': [1., 2., 3.]})
    writer.write(fn, df)
    out = ParquetFile(fn).to_pandas()
    assert list(out.columns) == ['x.y', 'y']

def test_write_compression_schema(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
    fn = os.path.join(tempdir, 'tmp.parq')
    writer.write(fn, df, compression={'x': 'gzip'})
    r = ParquetFile(fn)
    assert all(c.meta_data.codec for row in r.row_groups
               for c in row.columns
               if c.meta_data.path_in_schema == ['x'])
    assert not any(c.meta_data.codec for row in r.row_groups
                   for c in row.columns
                   if c.meta_data.path_in_schema == ['y'])

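# ``test_write_delta`` reads the file back through Spark. The ``sql`` handle
# is assumed to be a pyspark session fixture roughly like this sketch; the
# real suite may configure it differently:
@pytest.fixture(scope='module')
def sql():
    pytest.importorskip('pyspark')
    from pyspark.sql import SparkSession
    return SparkSession.builder.master('local[1]').getOrCreate()
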
def test_write_delta(tempdir, sql):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'i1': np.arange(10, dtype=np.int32) + 2,
                         'i2': np.cumsum(np.random.randint(
                             0, 5, size=10)).astype(np.int32) + 2})
    writer.write(fname, data, encoding="DELTA_BINARY_PACKED")
    df = sql.read.parquet(fname)
    ddf = df.toPandas()
    for col in data:
        assert (ddf[col] == data[col])[~ddf[col].isnull()].all()

def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world', None],
                                               size=1000)})
    data['cat'] = data['o'].astype('category')
    writer.write(fname, data, has_nulls=['o', 'cat'])
    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()

def test_groups_iterable(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['aaa', 'bbb', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a'], file_scheme='hive')
    r = ParquetFile(tempdir)
    assert r.columns == ['b', 'c']
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

def test_naive_index(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
    writer.write(fn, df)
    r = ParquetFile(fn)
    assert set(r.columns) == {'x', 'y'}

    writer.write(fn, df, write_index=True)
    r = ParquetFile(fn)
    assert set(r.columns) == {'x', 'y', 'index'}

def test_hasnulls_ordering(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'a': np.random.rand(100),
                         'b': np.random.rand(100),
                         'c': np.random.rand(100)})
    writer.write(fname, data, has_nulls=['a', 'c'])
    r = ParquetFile(fname)
    assert r._schema[1].name == 'a'
    assert r._schema[1].repetition_type == 1  # OPTIONAL
    assert r._schema[2].name == 'b'
    assert r._schema[2].repetition_type == 0  # REQUIRED
    assert r._schema[3].name == 'c'
    assert r._schema[3].repetition_type == 1  # OPTIONAL

def test_read_partitioned_and_write_with_empty_partitions(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', 'c'], size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a'], file_scheme='hive')
    df_filtered = ParquetFile(tempdir).to_pandas(
        filters=[('a', '==', 'b')]
    )
    writer.write(tempdir, df_filtered, partition_on=['a'], file_scheme='hive')
    df_loaded = ParquetFile(tempdir).to_pandas()
    tm.assert_frame_equal(df_filtered, df_loaded, check_categorical=False)

def test_empty_groupby(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    df.loc[499:, 'c'] = True  # no False in second half
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive',
                 row_group_offsets=[0, 500])
    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 6
    out = r.to_pandas()
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

def test_cats_in_part_files(tempdir):
    df = pd.DataFrame({'a': pd.Categorical(['a', 'b'] * 100)})
    writer.write(tempdir, df, file_scheme='hive', row_group_offsets=50)
    files = glob.glob(os.path.join(tempdir, 'part*'))
    pf = ParquetFile(tempdir)
    assert len(pf.row_groups) == 4
    kv = pf.fmd.key_value_metadata
    assert kv
    for f in files:
        pf = ParquetFile(f)
        assert pf.fmd.key_value_metadata == kv
        assert len(pf.row_groups) == 1
    out = pd.concat([ParquetFile(f).to_pandas() for f in files],
                    ignore_index=True)
    pd.testing.assert_frame_equal(df, out)

def test_decimal_roundtrip(tempdir):
    import decimal

    def decimal_convert(x):
        return decimal.Decimal(x)

    fname = os.path.join(tempdir, 'decitemp.parq')
    data = pd.DataFrame({'f64': np.arange(10000000, 10001000,
                                          dtype=np.float64) / 100000,
                         'f16': np.arange(1000, dtype=np.float16) / 10000})
    data['f64'] = data['f64'].apply(decimal_convert)
    data['f16'] = data['f16'].apply(decimal_convert)
    writer.write(fname, data)
    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (data[col] == df[col]).all()

def test_multi_index(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3]])
    idx.names = ['index0', 'index1']
    df = pd.DataFrame(list(range(9)), idx, ['col'])
    writer.write(fn, df)
    pf = ParquetFile(fn)
    assert set(pf.columns) == {'col', 'index0', 'index1'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == idx.names
    out = pf.to_pandas()
    assert out.index.names == idx.names
    pd.testing.assert_frame_equal(df, out)

    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(9)).all()
    assert len(out.columns) == 3

def test_index(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]},
                      index=pd.Index([10, 20, 30], name='z'))
    writer.write(fn, df)
    pf = ParquetFile(fn)
    assert set(pf.columns) == {'x', 'y', 'z'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == ['z']
    out = pf.to_pandas()
    assert out.index.name == 'z'
    pd.testing.assert_frame_equal(df, out, check_dtype=False)

    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(3)).all()
    assert (out.z == df.index).all()

def test_int_rowgroups(tempdir):
    # With an integer row_group_offsets, group sizes are balanced rather than
    # taken literally: 100 rows at a target of 30 or 33 gives four groups of
    # 25; a target of 34 or 35 gives [34, 34, 32].
    df = pd.DataFrame({'a': [1] * 100})
    fname = os.path.join(tempdir, 'test.parq')
    writer.write(fname, df, row_group_offsets=30)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [25, 25, 25, 25]
    writer.write(fname, df, row_group_offsets=33)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [25, 25, 25, 25]
    writer.write(fname, df, row_group_offsets=34)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [34, 34, 32]
    writer.write(fname, df, row_group_offsets=35)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [34, 34, 32]