def serialized_dict_to_dataframe(data):
    """Reconstruct a pandas DataFrame from its serialized dict form.

    Parameters
    ----------
    data : dict
        Mapping with 'blocks' (a list of serialized block dicts understood
        by _reconstruct_block) and 'axes' (the BlockManager axes).

    Returns
    -------
    pandas.DataFrame
    """
    # Import locally, consistent with the sibling implementation, so this
    # function does not depend on a module-level `_int` binding existing.
    import pandas.core.internals as _int
    reconstructed_blocks = [_reconstruct_block(block)
                            for block in data['blocks']]
    block_mgr = _int.BlockManager(reconstructed_blocks, data['axes'])
    return pd.DataFrame(block_mgr)
def serialized_dict_to_dataframe(data):
    """Rebuild a pandas DataFrame from its serialized dict representation.

    `data` carries 'blocks' (serialized block dicts) and 'axes' (the
    BlockManager axes); the result is produced via the pandas shim.
    """
    import pandas.core.internals as _int
    # Rehydrate every serialized block, then hand the manager to pandas.
    blocks = []
    for serialized in data['blocks']:
        blocks.append(_reconstruct_block(serialized))
    manager = _int.BlockManager(blocks, data['axes'])
    return _pandas_api.data_frame(manager)
def table_to_blockmanager(options, table, categories=None, ignore_metadata=False):
    """Convert an Arrow Table to a pandas BlockManager.

    When the table carries pandas metadata (and `ignore_metadata` is False),
    the original row index and column index are reconstructed from it;
    otherwise a plain RangeIndex is used for the rows.
    """
    data_column_metadata = []
    column_index_metadata = []
    pandas_metadata = table.schema.pandas_metadata

    if ignore_metadata or pandas_metadata is None:
        # No usable metadata: fall back to a default integer row index.
        row_index = pd.RangeIndex(table.num_rows)
    else:
        data_column_metadata = pandas_metadata['columns']
        column_index_metadata = pandas_metadata.get('column_indexes', [])
        index_descriptors = pandas_metadata['index_columns']
        table = _add_any_metadata(table, pandas_metadata)
        table, row_index = _reconstruct_index(table, index_descriptors,
                                              data_column_metadata)

    _check_data_column_metadata_consistency(data_column_metadata)

    blocks = _table_to_blocks(options, table, pa.default_memory_pool(),
                              categories)
    column_index = _deserialize_column_index(table, data_column_metadata,
                                             column_index_metadata)
    return _int.BlockManager(blocks, [column_index, row_index])
def table_to_blockmanager(options, table, memory_pool, nthreads=1, categoricals=None):
    """Convert an Arrow Table to a pandas BlockManager.

    Index columns recorded in the table's pandas metadata are split off and
    rebuilt as the row index; the remaining columns become the data blocks.
    """
    from pyarrow.compat import DatetimeTZDtype

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = metadata is not None and b'pandas' in metadata
    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table
    index_columns_set = frozenset(index_columns)

    # 'field_name' is the column's name in the Arrow table and 'name' is its
    # user-facing pandas name; the two differ for index columns. Falling back
    # on c['name'] keeps old metadata (pre-'field_name') working.
    logical_index_names = [
        c['name'] for c in columns
        if c.get('field_name', c['name']) in index_columns_set
    ]

    # Physical and logical index names must pair up one-to-one.
    assert len(logical_index_names) == len(index_columns_set)

    # Since pyarrow 0.8.0, c['name'] may only be None when 'field_name' is
    # present — both changes shipped in the same release.
    assert all(
        (c['name'] is None and 'field_name' in c) or c['name'] is not None
        for c in columns)

    # Pull each index column out of the data table, collecting its values
    # and reconstructed name.
    for physical_name, logical_name in zip(index_columns,
                                           logical_index_names):
        field_idx = schema.get_field_index(physical_name)
        if field_idx == -1:
            continue
        col_pandas = table.column(field_idx).to_pandas()
        values = col_pandas.values
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()
        if isinstance(col_pandas.dtype, DatetimeTZDtype):
            index_array = (pd.Series(values)
                           .dt.tz_localize('utc')
                           .dt.tz_convert(col_pandas.dtype.tz))
        else:
            index_array = pd.Series(values, dtype=col_pandas.dtype)
        index_arrays.append(index_array)
        index_names.append(
            _backwards_compatible_index_name(physical_name, logical_name))
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(physical_name))

    blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)

    # Build the row index from whatever index columns were found.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [col.name for col in block_table.itercolumns()]
    if columns:
        columns_name_dict = {
            c.get('field_name', _column_name_to_strings(c['name'])): c['name']
            for c in columns
        }
        columns_values = [
            columns_name_dict.get(name, name) for name in column_strings
        ]
    else:
        columns_values = column_strings

    # With multiple column indexes the values are stringified tuples, so
    # parse them with ast.literal_eval; otherwise wrap each in a 1-tuple.
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Construct the base column index.
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # Restore the column index dtypes/levels recorded in the metadata.
    if has_pandas_metadata:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    return _int.BlockManager(blocks, [columns, index])
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    """Convert an Arrow Table to a pandas BlockManager.

    Index columns listed in the pandas metadata are stripped from the data
    table and rebuilt as the row index; the remaining columns are converted
    to pandas blocks (with dictionary columns promoted to Categorical and
    timezone-tagged columns to DatetimeTZ blocks).
    """
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata
    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # The index columns sit at the tail of the column metadata; pair each
    # physical name with its logical (pandas-facing) name while stripping
    # those columns out of the data table.
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        field_idx = schema.get_field_index(raw_name)
        if field_idx == -1:
            continue
        col_pandas = table.column(field_idx).to_pandas()
        values = col_pandas.values
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()
        index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
        index_names.append(
            backwards_compatible_index_name(raw_name, logical_name))
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(raw_name))

    # Convert the Arrow table to raw pandas blocks via the C++ layer.
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Wrap every raw block, promoting dictionary-encoded columns to pandas
    # Categorical and timezone-tagged columns to DatetimeTZ blocks.
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            categorical = pd.Categorical(block_arr,
                                         categories=item['dictionary'],
                                         ordered=item['ordered'],
                                         fastpath=True)
            block = _int.make_block(categorical, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Build the row index from the extracted index columns.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [col.name for col in block_table.itercolumns()]
    if columns_metadata is None:
        columns_values = column_strings
    else:
        # Map the stringified column names back to their original values.
        columns_name_dict = {str(x['name']): x['name']
                             for x in columns_metadata}
        columns_values = [columns_name_dict.get(name, name)
                          for name in column_strings]

    # With multiple column indexes the values are stringified tuples, so
    # parse them with ast.literal_eval; otherwise wrap each in a 1-tuple.
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Construct the base column index.
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    if has_pandas_metadata:
        # Get levels and labels, providing sane defaults when the index has
        # a single level, to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Cast each level back to the numpy dtype recorded in the metadata.
        paired = [
            (level, col_index.get('numpy_type', level.dtype))
            for level, col_index in zip_longest(levels, column_indexes,
                                                fillvalue={})
        ]
        new_levels = [
            level if level.dtype == dtype else level.astype(dtype)
            for level, dtype in paired
        ]
        columns = pd.MultiIndex(levels=new_levels, labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    return _int.BlockManager(blocks, [columns, index])
def table_to_blockmanager(table, nthreads=1):
    """Convert an Arrow Table to a pandas BlockManager.

    Index columns listed in the table's pandas metadata are removed from the
    data columns and rebuilt as the row index.

    Parameters
    ----------
    table : pyarrow.Table
    nthreads : int, default 1
        Number of threads passed through to lib.table_to_blocks.

    Returns
    -------
    pandas.core.internals.BlockManager
    """
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table
    index_columns = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    # Split each index column off the data table.
    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            index_name = (None if is_unnamed_index_level(name)
                          else name)
            values = col.to_pandas().values
            # Guard with hasattr: to_pandas() can produce values objects
            # that have no .flags attribute, which previously raised
            # AttributeError here.
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()
            index_arrays.append(values)
            index_names.append(index_name)
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name))

    # Convert the remaining columns to raw pandas blocks.
    result = lib.table_to_blocks(block_table, nthreads)

    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False, fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Build the row index from the extracted index columns.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [[column.name for column in block_table.itercolumns()], index]
    return _int.BlockManager(blocks, axes)
def table_to_blockmanager(options, table, memory_pool, nthreads=1, categoricals=None):
    """Convert an Arrow Table to a pandas BlockManager.

    Index columns recorded in the pandas metadata are stripped from the data
    table and rebuilt as the row index; the remaining columns become blocks.
    """
    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata
    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # The index columns sit at the tail of the column metadata; pair each
    # physical name with its logical (pandas-facing) name while removing
    # those columns from the data table.
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        field_idx = schema.get_field_index(raw_name)
        if field_idx == -1:
            continue
        col_pandas = table.column(field_idx).to_pandas()
        values = col_pandas.values
        if hasattr(values, 'flags') and not values.flags.writeable:
            # ARROW-1054: in pandas 0.19.2, factorize will reject
            # non-writeable arrays when calling MultiIndex.from_arrays
            values = values.copy()
        index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
        index_names.append(
            _backwards_compatible_index_name(raw_name, logical_name))
        block_table = block_table.remove_column(
            block_table.schema.get_field_index(raw_name))

    blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)

    # Build the row index from the extracted index columns.
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [col.name for col in block_table.itercolumns()]
    if columns_metadata is None:
        columns_values = column_strings
    else:
        # Map the stringified column names back to their original values.
        columns_name_dict = {str(x['name']): x['name']
                             for x in columns_metadata}
        columns_values = [columns_name_dict.get(name, name)
                          for name in column_strings]

    # With multiple column indexes the values are stringified tuples, so
    # parse them with ast.literal_eval; otherwise wrap each in a 1-tuple.
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Construct the base column index.
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # Restore the column index dtypes/levels recorded in the metadata.
    if has_pandas_metadata:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    return _int.BlockManager(blocks, [columns, index])