def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Rebuild the column MultiIndex, casting each level to the dtype
    recorded under ``'numpy_type'`` in the ``column_indexes`` metadata.

    Part of table_to_blockmanager.
    """
    # A flat (single-level) Index has no 'levels'/'labels' attributes;
    # synthesize one-level defaults so a single code path handles both
    # shapes and we avoid if/else spaghetti below.
    levels = getattr(columns, 'levels', None)
    if not levels:
        levels = [columns]
    labels = getattr(columns, 'labels', None)
    if not labels:
        labels = [pd.RangeIndex(len(level)) for level in levels]

    # Cast each level to the dtype named in its metadata entry, falling
    # back to the level's current dtype when no entry exists (in which
    # case the astype below is a no-op and is skipped).
    new_levels = []
    for level, col_index in zip_longest(levels, column_indexes, fillvalue={}):
        dtype = col_index.get('numpy_type', level.dtype)
        if level.dtype != dtype:
            level = level.astype(dtype)
        new_levels.append(level)

    return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Construct a pandas MultiIndex from `columns` and column index
    metadata in `column_indexes`.

    Parameters
    ----------
    columns : List[pd.Index]
        The columns coming from a pyarrow.Table
    column_indexes : List[Dict[str, str]]
        The column index metadata deserialized from the JSON schema
        metadata in a :class:`~pyarrow.Table`.

    Returns
    -------
    result : MultiIndex
        The index reconstructed using `column_indexes` metadata with levels
        of the correct type.

    Notes
    -----
    * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
    """
    pd = _pandas_api.pd

    # Fall back to one-level defaults when `columns` is a flat Index so the
    # remainder of the function can treat both shapes uniformly.
    levels = getattr(columns, 'levels', None) or [columns]
    labels = _get_multiindex_codes(columns) or [
        pd.RangeIndex(len(level)) for level in levels
    ]

    # Cast every level to the dtype recorded under 'pandas_type' in its
    # metadata entry (defaulting to the level's current dtype).
    new_levels = []
    for level, col_index in zip_longest(levels, column_indexes, fillvalue={}):
        pandas_dtype = col_index.get('pandas_type', str(level.dtype))
        numpy_dtype = _pandas_type_to_numpy_type(pandas_dtype)
        if numpy_dtype == np.bytes_:
            # The schema metadata is UTF-8 encoded, so json.loads turned
            # values that were originally bytes into unicode strings;
            # re-encode them to restore the bytes-typed level.
            level = level.map(operator.methodcaller('encode', 'UTF-8'))
        elif level.dtype != numpy_dtype:
            level = level.astype(numpy_dtype)
        new_levels.append(level)

    return pd.MultiIndex(new_levels, labels, names=columns.names)
def _reconstruct_columns_from_metadata(columns, column_indexes):
    """Construct a pandas MultiIndex from `columns` and column index
    metadata in `column_indexes`.

    Parameters
    ----------
    columns : List[pd.Index]
        The columns coming from a pyarrow.Table
    column_indexes : List[Dict[str, str]]
        The column index metadata deserialized from the JSON schema
        metadata in a :class:`~pyarrow.Table`.

    Returns
    -------
    result : MultiIndex
        The index reconstructed using `column_indexes` metadata with levels
        of the correct type.

    Notes
    -----
    * Part of :func:`~pyarrow.pandas_compat.table_to_blockmanager`
    """
    # A single-level Index carries no 'levels'/'labels'; build one-level
    # defaults so both shapes flow through the same reconstruction loop.
    levels = getattr(columns, 'levels', None) or [columns]
    labels = getattr(columns, 'labels', None) or [
        pd.RangeIndex(len(level)) for level in levels
    ]

    encode_utf8 = operator.methodcaller('encode', 'UTF-8')

    new_levels = []
    for level, col_index in zip_longest(levels, column_indexes, fillvalue={}):
        pandas_dtype = col_index.get('pandas_type', str(level.dtype))
        dtype = _pandas_type_to_numpy_type(pandas_dtype)
        if dtype == np.bytes_:
            # json.loads decoded original bytes values to unicode strings
            # (the metadata is UTF-8); encode them back to preserve the
            # bytes-typed level.
            level = level.map(encode_utf8)
        elif level.dtype != dtype:
            level = level.astype(dtype)
        new_levels.append(level)

    return pd.MultiIndex(levels=new_levels, labels=labels,
                         names=columns.names)
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Convert a pyarrow Table into a pandas BlockManager.

    Reconstructs the row index and column index from the ``b'pandas'``
    schema metadata (when present), converts the Arrow columns into pandas
    internal Blocks, and assembles them into a BlockManager.

    NOTE(review): this targets pandas-internal APIs of the pandas 0.19-0.21
    era (``make_block(fastpath=...)``, ``CategoricalBlock``,
    ``MultiIndex(labels=...)``); it will not run against modern pandas.

    Parameters
    ----------
    options : object
        Conversion options forwarded verbatim to ``lib.table_to_blocks``.
    table : pyarrow.Table
        The table to convert.
    memory_pool : pyarrow.MemoryPool
        Memory pool forwarded to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Number of threads forwarded to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    # The pandas-specific schema metadata is stored under the b'pandas' key
    # as UTF-8 encoded JSON.
    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        # NOTE(review): duplicates the 'columns' lookup above, except that a
        # missing key yields None here instead of raising.
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table.  By convention the index columns are the last
    # len(index_columns) entries of the column metadata; zip() below guards
    # the case where index_columns is empty.
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()
            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            # Re-resolve the field index against the shrinking block_table,
            # since positions shift after each removal.
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            # fastpath=True: values are already integer codes into the
            # provided categories, so skip re-factorization.
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index: MultiIndex for several index columns, a plain
    # Index for one, and a RangeIndex when no index columns were stored.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        # Arrow field names are strings; map each one back to the original
        # (possibly non-string) pandas column name from the metadata.
        columns_name_dict = dict(
            (str(x['name']), x['name']) for x in columns_metadata)
        columns_values = [
            columns_name_dict[y] if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index
    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]
        columns = pd.MultiIndex(levels=new_levels, labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Convert a pyarrow Table into a pandas BlockManager.

    Rebuilds the row index and column index from the ``b'pandas'`` schema
    metadata (when present), converts the Arrow columns into pandas
    internal Blocks, and assembles them into a BlockManager.

    NOTE(review): this targets pandas-internal APIs of the pandas 0.19-0.21
    era (``make_block(fastpath=...)``, ``CategoricalBlock``,
    ``MultiIndex(labels=...)``); it will not run against modern pandas.

    Parameters
    ----------
    options : object
        Conversion options forwarded verbatim to ``lib.table_to_blocks``.
    table : pyarrow.Table
        The table to convert.
    memory_pool : pyarrow.MemoryPool
        Memory pool forwarded to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Number of threads forwarded to ``lib.table_to_blocks``.

    Returns
    -------
    pandas.core.internals.BlockManager
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    # The pandas-specific schema metadata is stored under the b'pandas' key
    # as UTF-8 encoded JSON.
    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            # Auto-generated index level names map back to a None name.
            index_name = None if is_unnamed_index_level(name) else name
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()
            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(index_name)
            # Re-resolve the field index against the shrinking block_table,
            # since positions shift after each removal.
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name)
            )

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            # fastpath=True: values are already integer codes into the
            # provided categories, so skip re-factorization.
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index: MultiIndex for several index columns, a plain
    # Index for one, and a RangeIndex when no index columns were stored.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index
    # Construct the base index
    if not column_strings:
        columns = pd.Index(column_strings)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, column_strings)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [
            (level, col_index.get('numpy_type', level.dtype))
            for level, col_index in zip_longest(
                levels, column_indexes, fillvalue={}
            )
        ]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]
        columns = pd.MultiIndex(
            levels=new_levels, labels=labels, names=columns.names
        )

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)