Ejemplo n.º 1
0
def serialized_dict_to_dataframe(data):
    reconstructed_blocks = [
        _reconstruct_block(block) for block in data['blocks']
    ]

    block_mgr = _int.BlockManager(reconstructed_blocks, data['axes'])
    return pd.DataFrame(block_mgr)
Ejemplo n.º 2
0
def serialized_dict_to_dataframe(data):
    import pandas.core.internals as _int
    reconstructed_blocks = [_reconstruct_block(block)
                            for block in data['blocks']]

    block_mgr = _int.BlockManager(reconstructed_blocks, data['axes'])
    return _pandas_api.data_frame(block_mgr)
Ejemplo n.º 3
0
def table_to_blockmanager(options,
                          table,
                          categories=None,
                          ignore_metadata=False):
    all_columns = []
    column_indexes = []
    pandas_metadata = table.schema.pandas_metadata

    if not ignore_metadata and pandas_metadata is not None:
        all_columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        index_descriptors = pandas_metadata['index_columns']
        table = _add_any_metadata(table, pandas_metadata)
        table, index = _reconstruct_index(table, index_descriptors,
                                          all_columns)
    else:
        index = pd.RangeIndex(table.num_rows)

    _check_data_column_metadata_consistency(all_columns)
    blocks = _table_to_blocks(options, table, pa.default_memory_pool(),
                              categories)
    columns = _deserialize_column_index(table, all_columns, column_indexes)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
Ejemplo n.º 4
0
def table_to_blockmanager(options,
                          table,
                          memory_pool,
                          nthreads=1,
                          categoricals=None):
    from pyarrow.compat import DatetimeTZDtype

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)

    block_table = table

    index_columns_set = frozenset(index_columns)

    # 0. 'field_name' is the name of the column in the arrow Table
    # 1. 'name' is the user-facing name of the column, that is, it came from
    #    pandas
    # 2. 'field_name' and 'name' differ for index columns
    # 3. We fall back on c['name'] for backwards compatibility
    logical_index_names = [
        c['name'] for c in columns
        if c.get('field_name', c['name']) in index_columns_set
    ]

    # There must be the same number of field names and physical names
    # (fields in the arrow Table)
    assert len(logical_index_names) == len(index_columns_set)

    # It can never be the case in a released version of pyarrow that
    # c['name'] is None *and* 'field_name' is not a key in the column metadata,
    # because the change to allow c['name'] to be None and the change to add
    # 'field_name' are in the same release (0.8.0)
    assert all(
        (c['name'] is None and 'field_name' in c) or c['name'] is not None
        for c in columns)

    # Build up a list of index columns and names while removing those columns
    # from the original table
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            if isinstance(col_pandas.dtype, DatetimeTZDtype):
                index_array = (
                    pd.Series(values).dt.tz_localize('utc').dt.tz_convert(
                        col_pandas.dtype.tz))
            else:
                index_array = pd.Series(values, dtype=col_pandas.dtype)
            index_arrays.append(index_array)
            index_names.append(
                _backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns:
        columns_name_dict = {
            c.get('field_name', _column_name_to_strings(c['name'])): c['name']
            for c in columns
        }
        columns_values = [
            columns_name_dict.get(name, name) for name in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
Ejemplo n.º 5
0
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'],
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = dict(
            (str(x['name']), x['name']) for x in columns_metadata)
        columns_values = [
            columns_name_dict[y] if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x, )

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:

        # Get levels and labels, and provide sane defaults if the index has a
        # single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]

        columns = pd.MultiIndex(levels=new_levels,
                                labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
Ejemplo n.º 6
0
def table_to_blockmanager(table, nthreads=1):
    import pandas.core.internals as _int
    from pyarrow.compat import DatetimeTZDtype
    import pyarrow.lib as lib

    block_table = table

    index_columns = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata

    if metadata is not None and b'pandas' in metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']

    for name in index_columns:
        i = schema.get_field_index(name)
        if i != -1:
            col = table.column(i)
            index_name = (None if is_unnamed_index_level(name) else name)
            values = col.to_pandas().values
            if not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(values)
            index_names.append(index_name)
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(name))

    result = lib.table_to_blocks(block_table, nthreads)

    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=False,
                                 fastpath=True)
            block = _int.make_block(cat,
                                    placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = DatetimeTZDtype('ns', tz=item['timezone'])
            block = _int.make_block(block_arr,
                                    placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype,
                                    fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    axes = [[column.name for column in block_table.itercolumns()], index]

    return _int.BlockManager(blocks, axes)
Ejemplo n.º 7
0
def table_to_blockmanager(options, table, memory_pool, nthreads=1,
                          categoricals=None):
    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                _backwards_compatible_index_name(raw_name, logical_name)
            )
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name)
            )

    blocks = _table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = dict(
            (str(x['name']), x['name'])
            for x in columns_metadata
        )
        columns_values = [
            columns_name_dict[y]
            if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        columns = _reconstruct_columns_from_metadata(columns, column_indexes)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)