def test_sorted_row_group_columns_with_filters(tempdir):
    """Sorted-column detection must honour row-group filters."""
    dd = pytest.importorskip('dask.dataframe')
    # Small frame whose 'id' column is used for hive-style partitioning;
    # writing with partition_on=['id'] puts each id value in its own directory.
    frame = pd.DataFrame(
        {
            'unique': [0, 0, 1, 1, 2, 2, 3, 3],
            'id': ['id1', 'id2', 'id1', 'id2', 'id1', 'id2', 'id1', 'id2'],
        },
        index=[0, 0, 1, 1, 2, 2, 3, 3],
    )
    ddf = dd.from_pandas(frame, npartitions=2)
    path = os.path.join(tempdir, 'foo.parquet')
    ddf.to_parquet(path, engine='fastparquet', partition_on=['id'])
    pf = ParquetFile(path)
    filters = [('id', '==', 'id1')]
    # Without a filter the interleaved row groups make no column sorted.
    assert sorted_partitioned_columns(pf) == {}
    # Restricting to the 'id1' row groups leaves both the index and the
    # 'unique' column monotonically increasing across row groups.
    stats = sorted_partitioned_columns(pf, filters=filters)
    assert stats == {
        'index': {'min': [0, 2], 'max': [1, 3]},
        'unique': {'min': [0, 2], 'max': [1, 3]},
    }
def test_sorted_row_group_columns_with_filters(tempdir):
    """Filtered row groups should report sorted min/max statistics."""
    dd = pytest.importorskip('dask.dataframe')
    # Dummy frame: 'id' alternates so partitioning on it splits the data.
    pdf = pd.DataFrame(
        {
            'unique': [0, 0, 1, 1, 2, 2, 3, 3],
            'id': ['id1', 'id2', 'id1', 'id2', 'id1', 'id2', 'id1', 'id2'],
        },
        index=[0, 0, 1, 1, 2, 2, 3, 3],
    )
    ddf = dd.from_pandas(pdf, npartitions=2)
    target = os.path.join(tempdir, 'foo.parquet')
    ddf.to_parquet(target, engine='fastparquet', partition_on=['id'])
    parquet_file = ParquetFile(target)
    # No filter: nothing is sorted across the interleaved row groups.
    unfiltered = sorted_partitioned_columns(parquet_file)
    assert unfiltered == {}
    # Filtering down to 'id1' rows makes both columns sorted.
    filtered = sorted_partitioned_columns(
        parquet_file, filters=[('id', '==', 'id1')])
    assert filtered == {
        'index': {'min': [0, 2], 'max': [1, 3]},
        'unique': {'min': [0, 2], 'max': [1, 3]},
    }
def test_sorted_row_group_columns(tempdir):
    """Columns monotonic across row groups are reported with their bounds."""
    data = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd'],
    })
    path = os.path.join(tempdir, 'foo.parquet')
    # Two row groups: rows [0, 2) and [2, 4).
    write(path, data, row_group_offsets=[0, 2])
    pf = ParquetFile(path)
    # 'x' and 'z' increase across row groups; 'y' oscillates and is omitted.
    assert sorted_partitioned_columns(pf) == {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }
def test_int96_stats(tempdir):
    """Datetime statistics must survive the int96 time encoding."""
    frame = pd.util.testing.makeMixedDataFrame()
    path = os.path.join(tempdir, 'foo.parquet')
    # Write datetimes using the legacy int96 representation.
    write(path, frame, row_group_offsets=[0, 2], times='int96')
    pf = ParquetFile(path)
    stats = statistics(pf)
    # The minimum of datetime column 'D' must come back as a timestamp type.
    assert isinstance(stats['min']['D'][0], (np.datetime64, Timestamp))
    assert 'D' in sorted_partitioned_columns(pf)
def test_int96_stats(tempdir): df = pd.util.testing.makeMixedDataFrame() fn = os.path.join(tempdir, 'foo.parquet') write(fn, df, row_group_offsets=[0, 2], times='int96') p = ParquetFile(fn) s = statistics(p) assert isinstance(s['min']['D'][0], (np.datetime64, pd.tslib.Timestamp)) assert 'D' in sorted_partitioned_columns(p)
def test_sorted_row_group_columns(tempdir):
    """Only columns sorted across row groups appear in the statistics."""
    frame = pd.DataFrame(
        {
            'x': [1, 2, 3, 4],
            'y': [1.0, 2.0, 1.0, 2.0],
            'z': ['a', 'b', 'c', 'd'],
        }
    )
    target = os.path.join(tempdir, 'foo.parquet')
    write(target, frame, row_group_offsets=[0, 2])
    parquet_file = ParquetFile(target)
    observed = sorted_partitioned_columns(parquet_file)
    # 'y' is not monotonic across the two row groups, so only 'x' and 'z'
    # should be reported.
    wanted = {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }
    assert observed == wanted
def test_sorted_row_group_columns(tempdir):
    """Sorted-column stats skip unorderable (JSON) columns and store raw strings."""
    frame = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd'],
    })
    path = os.path.join(tempdir, 'foo.parquet')
    write(path, frame, row_group_offsets=[0, 2],
          object_encoding={'v': 'json', 'z': 'utf8'})
    pf = ParquetFile(path)
    # String statistics should be stored without extra byte-encoding.
    zcol = next(c for c in pf.row_groups[0].columns
                if c.meta_data.path_in_schema == ['z'])
    assert zcol.meta_data.statistics.min == b'a'
    # NB column 'v' must not feature, as dicts are unorderable.
    assert sorted_partitioned_columns(pf) == {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }
def test_sorted_row_group_columns(tempdir):
    """JSON-encoded columns are excluded from sorted-column statistics."""
    data = pd.DataFrame(
        {
            'x': [1, 2, 3, 4],
            'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
            'y': [1.0, 2.0, 1.0, 2.0],
            'z': ['a', 'b', 'c', 'd'],
        }
    )
    target = os.path.join(tempdir, 'foo.parquet')
    write(target, data, row_group_offsets=[0, 2],
          object_encoding={'v': 'json', 'z': 'utf8'})
    parquet_file = ParquetFile(target)
    observed = sorted_partitioned_columns(parquet_file)
    # NB column 'v' should not feature, as dicts are unorderable.
    wanted = {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }
    assert observed == wanted
def test_sorted_row_group_columns(tempdir):
    """String stats are raw bytes; unorderable JSON columns are skipped."""
    pdf = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd'],
    })
    out = os.path.join(tempdir, 'foo.parquet')
    write(out, pdf, row_group_offsets=[0, 2],
          object_encoding={'v': 'json', 'z': 'utf8'})
    pf = ParquetFile(out)
    # String stats should be stored without byte-encoding.
    z_chunks = [c for c in pf.row_groups[0].columns
                if c.meta_data.path_in_schema == ['z']]
    assert z_chunks[0].meta_data.statistics.min == b'a'
    # NB column 'v' should not feature, as dicts are unorderable.
    assert sorted_partitioned_columns(pf) == {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }