Exemple #1
0
def _deserialize_column_index(block_table, all_columns, column_indexes):
    """Rebuild the pandas column index for ``block_table``.

    Parameters
    ----------
    block_table
        Object exposing ``column_names``. Names that are ``bytes`` are
        passed through ``frombytes``; other names are used as they are.
    all_columns
        Iterable of column metadata dicts. Each dict has a ``'name'`` entry
        and may have a ``'field_name'`` entry. When falsy, the column names
        of ``block_table`` are used unchanged.
    column_indexes
        Sized iterable of dicts, each with a ``'name'`` entry, one per level
        of the column index.

    Returns
    -------
    The column index produced by ``_flatten_single_level_multiindex``.
    """
    decoded_names = []
    for raw in block_table.column_names:
        decoded_names.append(frombytes(raw) if isinstance(raw, bytes) else raw)

    columns_values = decoded_names
    if all_columns:
        # Translate each field name back to the 'name' recorded in the
        # metadata; names without a metadata entry are kept untouched.
        field_to_name = {}
        for meta in all_columns:
            field_name = meta.get(
                'field_name', _column_name_to_strings(meta['name'])
            )
            field_to_name[field_name] = meta['name']
        columns_values = [
            field_to_name.get(label, label) for label in decoded_names
        ]

    n_levels = len(column_indexes)

    if columns_values:
        if n_levels > 1:
            # With several column index levels the values show up as a list
            # of tuples, so each one is evaluated with ast.literal_eval.
            to_pair = ast.literal_eval
        else:
            def to_pair(value):
                return (value, )

        columns = _pandas_api.pd.MultiIndex.from_tuples(
            [to_pair(value) for value in columns_values],
            names=[level['name'] for level in column_indexes] or None,
        )
    else:
        # Nothing to turn into tuples: build the base index directly.
        columns = _pandas_api.pd.Index(columns_values)

    # Only reconstruct from metadata when index levels were supplied.
    if n_levels > 0:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    return _flatten_single_level_multiindex(columns)
Exemple #2
0

@parquet
@pytest.mark.parametrize(
    'data, dtype, min_value, max_value, null_count, num_values',
    [
        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
        (
            [u'', u'b', unichar(1000), None, u'aaa'],
            str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4
        ),
        # Builtin ``bool`` rather than the ``np.bool`` alias: the alias was
        # deprecated in NumPy 1.20 and removed in 1.24, where referencing it
        # raises AttributeError while this module is being collected.
        ([True, False, False, True, True], bool, u'0', u'1', 0, 5),
    ]
)
def test_parquet_column_statistics_api(data, dtype, min_value, max_value,
                                       null_count, num_values):
    """Check the minimum reported by the column statistics of a sample file.

    A one-column DataFrame is built from ``data`` with the requested
    ``dtype``, handed to ``make_sample_file``, and the statistics of the
    first column of the first row group are compared with ``min_value``.

    NOTE(review): ``max_value``, ``null_count`` and ``num_values`` are part
    of the parametrization but are not asserted in this body.
    """
    df = pd.DataFrame({'data': data}, dtype=dtype)

    fileh = make_sample_file(df)

    meta = fileh.metadata

    # Statistics are read from row group 0, column 0.
    rg_meta = meta.row_group(0)
    col_meta = rg_meta.column(0)

    stat = col_meta.statistics
    assert stat.min == min_value
Exemple #3
0
@parquet
@pytest.mark.parametrize(
    'data, dtype, min_value, max_value, null_count, num_values',
    [
        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
        (
            [u'', u'b', unichar(1000), None, u'aaa'],
            str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4
        ),
        ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5),
    ]
)
def test_parquet_column_statistics_api(
        data,
        dtype,
        min_value,
        max_value,
        null_count,
        num_values):
    df = pd.DataFrame({'data': data}, dtype=dtype)

    fileh = make_sample_file(df)