def _deserialize_column_index(block_table, all_columns, column_indexes):
    """Rebuild the pandas column Index for *block_table* from metadata.

    Parameters
    ----------
    block_table : the Arrow table whose ``column_names`` provide the raw
        (possibly bytes-encoded) column labels
    all_columns : list of per-column metadata dicts; maps stored field
        names back to the original pandas column names
    column_indexes : list of metadata dicts describing the column
        index level(s); more than one entry means a MultiIndex
    """
    # Decode any bytes column names to str.
    raw_names = []
    for name in block_table.column_names:
        raw_names.append(frombytes(name) if isinstance(name, bytes) else name)

    if all_columns:
        # Map stored field names back to the original column names,
        # falling back to the raw name when no mapping exists.
        field_to_name = {}
        for col in all_columns:
            key = col.get('field_name', _column_name_to_strings(col['name']))
            field_to_name[key] = col['name']
        columns_values = [field_to_name.get(n, n) for n in raw_names]
    else:
        columns_values = raw_names

    # With multiple column index levels the stored names are stringified
    # tuples, so parse them with ast.literal_eval; with a single level,
    # wrap each name in a 1-tuple instead.
    if len(column_indexes) > 1:
        to_pair = ast.literal_eval
    else:
        def to_pair(name):
            return (name,)

    # Construct the base index.
    if not columns_values:
        columns = _pandas_api.pd.Index(columns_values)
    else:
        level_names = [ci['name'] for ci in column_indexes] or None
        columns = _pandas_api.pd.MultiIndex.from_tuples(
            [to_pair(v) for v in columns_values],
            names=level_names,
        )

    # If there is index metadata, reconstruct dtypes/levels from it.
    if len(column_indexes) > 0:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    return columns
@parquet
@pytest.mark.parametrize(
    'data, dtype, min_value, max_value, null_count, num_values',
    [
        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
        ([u'', u'b', unichar(1000), None, u'aaa'], str,
         u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4),
        # np.bool was a deprecated alias for the builtin bool and has been
        # removed from NumPy (>= 1.24); np.bool_ is the stable scalar type
        # and behaves identically here on all NumPy versions.
        ([True, False, False, True, True], np.bool_, u'0', u'1', 0, 5),
    ])
def test_parquet_column_statistics_api(data, dtype, min_value, max_value,
                                       null_count, num_values):
    """Check the min statistic reported by the Parquet column metadata API.

    Round-trips a single-column DataFrame of the given dtype through a
    sample Parquet file and reads back the first row group's column
    statistics.
    """
    df = pd.DataFrame({'data': data}, dtype=dtype)

    fileh = make_sample_file(df)

    meta = fileh.metadata
    rg_meta = meta.row_group(0)
    col_meta = rg_meta.column(0)
    stat = col_meta.statistics
    # Only the minimum is asserted here; the parametrized max/null/count
    # values describe the expected statistics for reference.
    assert stat.min == min_value
@parquet @pytest.mark.parametrize( 'data, dtype, min_value, max_value, null_count, num_values', [ ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4), ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4), ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4), ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4), ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4), ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4), ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4), ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4), ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4), ( [u'', u'b', unichar(1000), None, u'aaa'], str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4 ), ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5), ] ) def test_parquet_column_statistics_api( data, dtype, min_value, max_value, null_count, num_values): df = pd.DataFrame({'data': data}, dtype=dtype) fileh = make_sample_file(df)