def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Assemble a pandas ``BlockManager`` from ``table``.

    Index columns named in the schema's ``b'pandas'`` metadata (when that
    metadata is present) are taken out of the table and become the row
    index.  The remaining columns are converted to blocks through
    ``lib.table_to_blocks`` and the column axis is rebuilt from the stored
    column metadata.

    Parameters
    ----------
    options
        Forwarded unchanged to ``lib.table_to_blocks``.
    table
        Object exposing ``schema``, ``num_rows``, ``column``,
        ``remove_column`` and ``itercolumns`` (presumably a pyarrow Table).
    memory_pool
        Forwarded unchanged to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    BlockManager
        ``_int.BlockManager(blocks, [columns, index])``.
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    schema = table.schema
    n_rows = table.num_rows
    raw_metadata = schema.metadata

    index_columns = []
    columns = []
    column_indexes = []
    columns_metadata = None

    has_pandas_metadata = (
        raw_metadata is not None and b'pandas' in raw_metadata
    )
    if has_pandas_metadata:
        pandas_metadata = json.loads(raw_metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Collect the index columns and their names, dropping each one from the
    # table that will be converted to blocks.  The logical names are read
    # from the trailing entries of the column metadata.
    index_arrays = []
    index_names = []
    logical_index_names = [
        entry['name'] for entry in columns[-len(index_columns):]
    ]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        position = schema.get_field_index(raw_name)
        if position == -1:
            # The index column is not stored in the table; nothing to pull.
            continue

        as_pandas = table.column(position).to_pandas()
        values = as_pandas.values
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()

        index_arrays.append(pd.Series(values, dtype=as_pandas.dtype))
        index_names.append(
            backwards_compatible_index_name(raw_name, logical_name)
        )
        drop_at = block_table.schema.get_field_index(raw_name)
        block_table = block_table.remove_column(drop_at)

    # Convert the remaining arrow columns to raw block descriptions.
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Wrap each description in the matching pandas Block: dictionary data
    # becomes a CategoricalBlock, timezone-carrying data a DatetimeTZBlock,
    # everything else a plain block.
    blocks = []
    for item in result:
        placement = item['placement']
        block_arr = item['block']
        if 'dictionary' in item:
            categorical = pd.Categorical(
                block_arr,
                categories=item['dictionary'],
                ordered=item['ordered'],
                fastpath=True,
            )
            block = _int.make_block(
                categorical,
                placement=placement,
                klass=_int.CategoricalBlock,
                fastpath=True,
            )
        elif 'timezone' in item:
            block = _int.make_block(
                block_arr,
                placement=placement,
                klass=_int.DatetimeTZBlock,
                dtype=make_datetimetz(item['timezone']),
                fastpath=True,
            )
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Row index: fall back to a RangeIndex when no index column was found.
    if not index_arrays:
        index = pd.RangeIndex(n_rows)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)

    # Map the stored column names back to the names recorded in the
    # metadata (keyed by their string form); unknown names pass through.
    stored_names = [field.name for field in block_table.itercolumns()]
    if columns_metadata is None:
        columns_values = stored_names
    else:
        recorded_names = {
            str(entry['name']): entry['name'] for entry in columns_metadata
        }
        columns_values = [
            recorded_names.get(stored, stored) for stored in stored_names
        ]

    # With several column index levels the values are string-encoded tuples
    # and are parsed with ast.literal_eval; otherwise each value is wrapped
    # in a 1-tuple.
    if len(column_indexes) > 1:
        to_pair = ast.literal_eval
    else:
        def to_pair(value):
            return (value, )

    if columns_values:
        columns = pd.MultiIndex.from_tuples(
            [to_pair(value) for value in columns_values],
            names=[col_index['name'] for col_index in column_indexes] or None,
        )
    else:
        columns = pd.Index(columns_values)

    if has_pandas_metadata:
        # Single-level indexes get one-element defaults so the same code
        # path handles both the flat and the multi-level case.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Cast every level to the dtype recorded in the metadata, if any.
        new_levels = []
        for level, col_index in zip_longest(levels, column_indexes,
                                            fillvalue={}):
            wanted = col_index.get('numpy_type', level.dtype)
            if level.dtype == wanted:
                new_levels.append(level)
            else:
                new_levels.append(level.astype(wanted))

        columns = pd.MultiIndex(
            levels=new_levels, labels=labels, names=columns.names
        )

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    return _int.BlockManager(blocks, [columns, index])
def table_to_blockmanager(table, nthreads=1):
    """Build a pandas ``BlockManager`` from ``table``.

    When the schema metadata has a ``b'pandas'`` entry, the columns listed
    under its ``'index_columns'`` key are removed from the table and used as
    the row index.  All remaining columns are converted to blocks via
    ``lib.table_to_blocks``.

    Parameters
    ----------
    table
        Object exposing ``schema``, ``num_rows``, ``column``,
        ``remove_column`` and ``itercolumns`` (presumably a pyarrow Table).
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    BlockManager
        ``_int.BlockManager(blocks, [column_names, index])``.
    """
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []

    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    # Pull each index column that is actually present out of the table.
    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            index_name = None if is_unnamed_index_level(name) else name
            values = col.to_pandas().values
            # Only objects exposing ``flags`` (ndarrays) can be checked for
            # writeability; other containers such as pandas.Categorical have
            # no such attribute and previously raised AttributeError here.
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(values)
            index_names.append(index_name)
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name)
            )

    result = lib.table_to_blocks(block_table, nthreads)

    # Wrap each converted item in the matching pandas Block type.
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False, fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Row index: MultiIndex for several index columns, plain Index for one,
    # RangeIndex over the row count when none was found.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [
        [column.name for column in block_table.itercolumns()],
        index
    ]

    return _int.BlockManager(blocks, axes)
def table_to_blockmanager(table, nthreads=1):
    """Build a pandas ``BlockManager`` from ``table``.

    Columns listed under ``'index_columns'`` in the schema's ``b'pandas'``
    metadata (if that metadata exists) are removed from the table and turned
    into the row index.  The rest of the table is converted to blocks with
    ``lib.table_to_blocks``.

    Parameters
    ----------
    table
        Object exposing ``schema``, ``num_rows``, ``column``,
        ``remove_column`` and ``itercolumns`` (presumably a pyarrow Table).
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    BlockManager
        ``_int.BlockManager(blocks, [column_names, index])``.
    """
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []

    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    for name in index_columns:
        i = schema.get_field_index(name)
        if i == -1:
            # Index column named in the metadata but absent from the table.
            continue

        col = table.column(i)
        index_name = None if is_unnamed_index_level(name) else name
        values = col.to_pandas().values
        # ``values`` is not guaranteed to be an ndarray (for instance a
        # pandas.Categorical has no ``flags`` attribute), so check for the
        # attribute before reading it instead of raising AttributeError.
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()

        index_arrays.append(values)
        index_names.append(index_name)
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(name))

    result = lib.table_to_blocks(block_table, nthreads)

    # Turn every converted item into the appropriate pandas Block.
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False, fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Several index columns give a MultiIndex, one gives an Index, and none
    # gives a RangeIndex spanning the table's rows.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [[column.name for column in block_table.itercolumns()], index]

    return _int.BlockManager(blocks, axes)
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Build a pandas ``BlockManager`` from ``table``.

    If the schema metadata has a ``b'pandas'`` entry, the columns listed
    under its ``'index_columns'`` key are removed from the table and become
    the row index, and its ``'column_indexes'`` entries (when present) are
    used to name and re-type the column index levels.  The remaining columns
    are converted to blocks with ``lib.table_to_blocks``.

    Parameters
    ----------
    options
        Forwarded unchanged to ``lib.table_to_blocks``.
    table
        Object exposing ``schema``, ``num_rows``, ``column``,
        ``remove_column`` and ``itercolumns`` (presumably a pyarrow Table).
    memory_pool
        Forwarded unchanged to ``lib.table_to_blocks``.
    nthreads : int, default 1
        Forwarded unchanged to ``lib.table_to_blocks``.

    Returns
    -------
    BlockManager
        ``_int.BlockManager(blocks, [columns, index])``.
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            index_name = None if is_unnamed_index_level(name) else name
            col_pandas = col.to_pandas()
            values = col_pandas.values
            # ``values`` may be a container without a ``flags`` attribute
            # (e.g. pandas.Categorical); guard the lookup so such index
            # columns no longer raise AttributeError.
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(index_name)
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name)
            )

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not column_strings:
        columns = pd.Index(column_strings)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, column_strings)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [
            (level, col_index.get('numpy_type', level.dtype))
            for level, col_index in zip_longest(
                levels, column_indexes, fillvalue={}
            )
        ]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]

        columns = pd.MultiIndex(
            levels=new_levels,
            labels=labels,
            names=columns.names
        )

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)