Esempio n. 1
0
def test_sorted_row_group_columns_with_filters(tempdir):
    """Filtering to one partition should expose the per-group sort order."""
    dd = pytest.importorskip('dask.dataframe')
    # Index and 'unique' increase monotonically within each 'id' partition,
    # but the interleaved rows hide that until a filter is applied.
    frame = pd.DataFrame({'unique': [0, 0, 1, 1, 2, 2, 3, 3],
                          'id': ['id1', 'id2'] * 4},
                         index=[0, 0, 1, 1, 2, 2, 3, 3])
    frame = dd.from_pandas(frame, npartitions=2)
    path = os.path.join(tempdir, 'foo.parquet')
    frame.to_parquet(path, engine='fastparquet', partition_on=['id'])
    pf = ParquetFile(path)
    filters = [('id', '==', 'id1')]

    # Without filters no column appears sorted across row groups.
    assert sorted_partitioned_columns(pf) == {}

    # With the filter, both the index and 'unique' are sorted.
    expected = {'index': {'min': [0, 2], 'max': [1, 3]},
                'unique': {'min': [0, 2], 'max': [1, 3]}}
    assert sorted_partitioned_columns(pf, filters=filters) == expected
Esempio n. 2
0
def test_sorted_row_group_columns_with_filters(tempdir):
    """Partition filters reveal sortedness hidden by interleaved ids."""
    dd = pytest.importorskip('dask.dataframe')
    # Dummy frame: monotonic index/'unique', alternating partition key 'id'.
    identifiers = [
        'id1', 'id2',
        'id1', 'id2',
        'id1', 'id2',
        'id1', 'id2',
    ]
    pdf = pd.DataFrame(
        {
            'unique': [0, 0, 1, 1, 2, 2, 3, 3],
            'id': identifiers,
        },
        index=[0, 0, 1, 1, 2, 2, 3, 3],
    )
    ddf = dd.from_pandas(pdf, npartitions=2)
    fn = os.path.join(tempdir, 'foo.parquet')
    ddf.to_parquet(
        fn,
        engine='fastparquet',
        partition_on=['id'],
    )
    pf = ParquetFile(fn)
    filters = [('id', '==', 'id1')]

    # No filters: nothing is reported as sorted.
    result = sorted_partitioned_columns(pf)
    assert result == {}

    # Filtered to a single 'id': both columns are sorted.
    result = sorted_partitioned_columns(pf, filters=filters)
    expected = {
        'index': {'min': [0, 2], 'max': [1, 3]},
        'unique': {'min': [0, 2], 'max': [1, 3]},
    }
    assert result == expected
Esempio n. 3
0
def test_sorted_row_group_columns(tempdir):
    """Only globally sorted columns ('x', 'z') are reported; 'y' is not."""
    frame = pd.DataFrame({'x': [1, 2, 3, 4],
                          'y': [1.0, 2.0, 1.0, 2.0],
                          'z': ['a', 'b', 'c', 'd']})
    path = os.path.join(tempdir, 'foo.parquet')
    # Two row groups: rows [0, 2) and [2, 4).
    write(path, frame, row_group_offsets=[0, 2])

    stats = sorted_partitioned_columns(ParquetFile(path))
    assert stats == {'x': {'min': [1, 3], 'max': [2, 4]},
                     'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}
Esempio n. 4
0
def test_int96_stats(tempdir):
    """int96-encoded timestamps must still yield usable column statistics.

    Fix: ``pd.util.testing.makeMixedDataFrame()`` was deprecated and removed
    from pandas; build the identical mixed-type frame explicitly instead.
    """
    df = pd.DataFrame({
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        # Business-day range, matching what makeMixedDataFrame produced.
        'D': pd.bdate_range('1/1/2009', periods=5),
    })

    fn = os.path.join(tempdir, 'foo.parquet')
    # Two row groups, with the datetime column stored as int96.
    write(fn, df, row_group_offsets=[0, 2], times='int96')

    p = ParquetFile(fn)

    # Min statistics for 'D' must decode to a datetime type, and the
    # monotonically increasing column must be recognized as sorted.
    s = statistics(p)
    assert isinstance(s['min']['D'][0], (np.datetime64, Timestamp))
    assert 'D' in sorted_partitioned_columns(p)
Esempio n. 5
0
def test_int96_stats(tempdir):
    """int96-encoded timestamps must still yield usable column statistics.

    Fixes: ``pd.tslib.Timestamp`` no longer exists (``pd.tslib`` was removed
    from pandas) — use the public ``pd.Timestamp``. Also,
    ``pd.util.testing.makeMixedDataFrame()`` was removed; construct the
    identical mixed-type frame explicitly.
    """
    df = pd.DataFrame({
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        # Business-day range, matching what makeMixedDataFrame produced.
        'D': pd.bdate_range('1/1/2009', periods=5),
    })

    fn = os.path.join(tempdir, 'foo.parquet')
    # Two row groups, with the datetime column stored as int96.
    write(fn, df, row_group_offsets=[0, 2], times='int96')

    p = ParquetFile(fn)

    # Min statistics for 'D' must decode to a datetime type, and the
    # monotonically increasing column must be recognized as sorted.
    s = statistics(p)
    assert isinstance(s['min']['D'][0], (np.datetime64, pd.Timestamp))
    assert 'D' in sorted_partitioned_columns(p)
Esempio n. 6
0
def test_sorted_row_group_columns(tempdir):
    """Sorted-column detection reports 'x' and 'z' but not unsorted 'y'."""
    df = pd.DataFrame(
        {
            'x': [1, 2, 3, 4],
            'y': [1.0, 2.0, 1.0, 2.0],
            'z': ['a', 'b', 'c', 'd'],
        }
    )

    fn = os.path.join(tempdir, 'foo.parquet')
    # Split the frame into two row groups at row 2.
    write(fn, df, row_group_offsets=[0, 2])

    pf = ParquetFile(fn)

    result = sorted_partitioned_columns(pf)
    expected = {
        'x': {
            'min': [1, 3],
            'max': [2, 4],
        },
        'z': {
            'min': ['a', 'c'],
            'max': ['b', 'd'],
        },
    }

    assert result == expected
Esempio n. 7
0
def test_sorted_row_group_columns(tempdir):
    """JSON (dict) columns are unorderable and must be skipped in sort stats."""
    frame = pd.DataFrame({'x': [1, 2, 3, 4],
                          'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
                          'y': [1.0, 2.0, 1.0, 2.0],
                          'z': ['a', 'b', 'c', 'd']})

    path = os.path.join(tempdir, 'foo.parquet')
    # Two row groups; 'v' stored as JSON, 'z' as utf8 strings.
    write(path, frame, row_group_offsets=[0, 2],
          object_encoding={'v': 'json', 'z': 'utf8'})

    pf = ParquetFile(path)

    # String stats should be stored without byte-encoding.
    zcols = [c for c in pf.row_groups[0].columns
             if c.meta_data.path_in_schema == ['z']]
    assert zcols[0].meta_data.statistics.min == b'a'

    # Only 'x' and 'z' are sorted; 'y' is not, and 'v' (dicts) is unorderable.
    assert sorted_partitioned_columns(pf) == {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }
Esempio n. 8
0
def test_sorted_row_group_columns(tempdir):
    """Dict-valued column 'v' must not appear among sorted columns."""
    df = pd.DataFrame(
        {
            'x': [1, 2, 3, 4],
            'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
            'y': [1.0, 2.0, 1.0, 2.0],
            'z': ['a', 'b', 'c', 'd'],
        }
    )

    fn = os.path.join(tempdir, 'foo.parquet')
    write(
        fn,
        df,
        row_group_offsets=[0, 2],
        object_encoding={'v': 'json', 'z': 'utf8'},
    )

    pf = ParquetFile(fn)

    result = sorted_partitioned_columns(pf)
    expected = {
        'x': {'min': [1, 3], 'max': [2, 4]},
        'z': {'min': ['a', 'c'], 'max': ['b', 'd']},
    }

    # NB: column 'v' should not feature, as dicts are unorderable.
    assert result == expected
Esempio n. 9
0
def test_sorted_row_group_columns(tempdir):
    """Sort stats skip JSON dicts and store string minima unencoded."""
    data = {'x': [1, 2, 3, 4],
            'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
            'y': [1.0, 2.0, 1.0, 2.0],
            'z': ['a', 'b', 'c', 'd']}
    frame = pd.DataFrame(data)

    path = os.path.join(tempdir, 'foo.parquet')
    encodings = {'v': 'json', 'z': 'utf8'}
    write(path, frame, row_group_offsets=[0, 2], object_encoding=encodings)

    pf = ParquetFile(path)

    # String stats should be stored without byte-encoding.
    first_group = pf.row_groups[0].columns
    zcol = next(c for c in first_group
                if c.meta_data.path_in_schema == ['z'])
    assert zcol.meta_data.statistics.min == b'a'

    # NB: column 'v' should not feature, as dicts are unorderable;
    # 'y' is excluded because it is not monotonic across row groups.
    expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}
    assert sorted_partitioned_columns(pf) == expected