Code example #1
File: store_hdf5.py  Project: adamczykm/static-frame
    def read(
        self,
        label: tp.Optional[str] = None,
        *,
        config: tp.Optional[StoreConfig] = None,
        container_type: tp.Type[Frame] = Frame,
    ) -> Frame:
        '''
        Args:
            {dtypes}
        '''
        import tables

        if config is None:
            config = StoreConfig()  # get default
        if config.dtypes:
            raise NotImplementedError(
                'using config.dtypes on HDF5 not yet supported')

        index_depth = config.index_depth
        columns_depth = config.columns_depth

        index_arrays = []
        columns_labels = []

        with tables.open_file(self._fp, mode='r') as file:
            table = file.get_node(f'/{label}')
            colnames = table.cols._v_colnames

            def blocks() -> tp.Iterator[np.ndarray]:
                for col_idx, colname in enumerate(colnames):

                    # can also do: table.read(field=colname)
                    array = table.col(colname)

                    if array.dtype.kind in DTYPE_STR_KIND:
                        array = array.astype(str)
                    array.flags.writeable = False

                    if col_idx < index_depth:
                        index_arrays.append(array)
                        continue
                    # only store column labels for those yielded
                    columns_labels.append(colname)
                    yield array

            if config.consolidate_blocks:
                data = TypeBlocks.from_blocks(
                    TypeBlocks.consolidate_blocks(blocks()))
            else:
                data = TypeBlocks.from_blocks(blocks())

        return container_type._from_data_index_arrays_column_labels(
            data=data,
            index_depth=index_depth,
            index_arrays=index_arrays,
            columns_depth=columns_depth,
            columns_labels=columns_labels,
            name=tp.cast(tp.Hashable, label)  # not sure why this is necessary
        )
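A minimal usage sketch for the reader above, assuming StoreHDF5 and StoreConfig are importable as in this module's source tree; the file path and label are hypothetical:

    from static_frame.core.store import StoreConfig
    from static_frame.core.store_hdf5 import StoreHDF5

    store = StoreHDF5('data.h5')  # hypothetical file previously written by this Store
    config = StoreConfig(index_depth=1, columns_depth=1)
    f = store.read('frame_a', config=config)  # returns an immutable Frame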
Code example #2
    def read_many(self,
            labels: tp.Iterable[tp.Hashable],
            *,
            config: StoreConfigMapInitializer = None,
            container_type: tp.Type[Frame] = Frame,
            ) -> tp.Iterator[Frame]:
        import tables
        config_map = StoreConfigMap.from_initializer(config)

        with tables.open_file(self._fp, mode='r') as file:
            for label in labels:
                c = config_map[label]
                label_encoded = config_map.default.label_encode(label)

                index_depth = c.index_depth
                index_constructors = c.index_constructors
                columns_depth = c.columns_depth
                columns_constructors = c.columns_constructors
                consolidate_blocks = c.consolidate_blocks
                if c.dtypes:
                    raise NotImplementedError('using config.dtypes on HDF5 not yet supported')

                index_arrays = []
                columns_labels = []

                table = file.get_node(f'/{label_encoded}')
                colnames = table.cols._v_colnames

                def blocks() -> tp.Iterator[np.ndarray]:
                    for col_idx, colname in enumerate(colnames):
                        # can also do: table.read(field=colname)
                        array = table.col(colname)
                        if array.dtype.kind in DTYPE_STR_KINDS:
                            array = array.astype(str)
                        array.flags.writeable = False

                        if col_idx < index_depth:
                            index_arrays.append(array)
                            continue
                        # only store column labels for those yielded
                        columns_labels.append(colname)
                        yield array

                if consolidate_blocks:
                    data = TypeBlocks.from_blocks(TypeBlocks.consolidate_blocks(blocks()))
                else:
                    data = TypeBlocks.from_blocks(blocks())

                # this will own_data in subsequent constructor call
                yield container_type._from_data_index_arrays_column_labels(
                        data=data,
                        index_depth=index_depth,
                        index_arrays=index_arrays,
                        index_constructors=index_constructors,
                        columns_depth=columns_depth,
                        columns_labels=columns_labels,
                        columns_constructors=columns_constructors,
                        name=label,
                        )
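A hedged sketch of driving the batch reader; labels and per-label configuration are hypothetical, and a plain dict mapping label to StoreConfig is assumed to be an accepted StoreConfigMapInitializer, as suggested by StoreConfigMap.from_initializer above:

    from static_frame.core.store import StoreConfig
    from static_frame.core.store_hdf5 import StoreHDF5

    store = StoreHDF5('data.h5')  # hypothetical file
    config = {
        'frame_a': StoreConfig(index_depth=1),
        'frame_b': StoreConfig(index_depth=2, consolidate_blocks=True),
    }
    for f in store.read_many(('frame_a', 'frame_b'), config=config):
        print(f.name, f.shape)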
Code example #3
    def to_type_blocks(self) -> TypeBlocks:
        '''
        Provide a correctly typed TypeBlocks representation.
        '''
        depth_count = self.depth
        if depth_count == 0:
            return TypeBlocks.from_zero_size_shape()

        return TypeBlocks.from_blocks(
            self.values_at_depth(d) for d in range(depth_count))
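TypeBlocks.from_blocks is the recurring constructor in these examples: it consumes an iterator of NumPy arrays, one per column (or per 2D block). A minimal sketch with hypothetical data, following the convention above of making each array immutable before yielding it:

    import numpy as np
    from static_frame.core.type_blocks import TypeBlocks

    def blocks():
        for values in ((1, 2, 3), ('a', 'b', 'c')):
            array = np.array(values)
            array.flags.writeable = False  # match the convention above: immutable arrays
            yield array

    tb = TypeBlocks.from_blocks(blocks())
    print(tb.shape)  # (3, 2): two columns, each retaining its own dtype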
Code example #4
File: index_level.py  Project: admdev8/static-frame
    def to_type_blocks(self) -> TypeBlocks:
        '''
        Provide a correctly typed TypeBlocks representation.
        '''
        try:
            depth_count = self.depth
        except StopIteration:
            # assume we have no depth or length
            return TypeBlocks.from_zero_size_shape()

        return TypeBlocks.from_blocks(
            self.values_at_depth(d) for d in range(depth_count))
Code example #5
    def _index_decode(
        *,
        archive: Archive,
        metadata: tp.Dict[str, tp.Any],
        key_template_values: str,
        key_types: str,
        depth: int,
        cls_index: tp.Type['IndexBase'],
        name: NameType,
    ) -> tp.Optional['IndexBase']:
        '''Build index or columns.
        '''
        from static_frame.core.type_blocks import TypeBlocks

        if key_template_values.format(0) not in archive.labels:
            index = None
        elif depth == 1:
            index = cls_index(
                archive.read_array(key_template_values.format(0)),
                name=name,
            )
        else:
            index_tb = TypeBlocks.from_blocks(
                archive.read_array(key_template_values.format(i))
                for i in range(depth))
            index_constructors = [
                ContainerMap.str_to_cls(name) for name in metadata[key_types]
            ]
            index = cls_index._from_type_blocks(
                index_tb,  # type: ignore
                name=name,
                index_constructors=index_constructors,
            )
        return index
Code example #6
            def blocks() -> tp.Iterator[np.ndarray]:
                type_blocks = []
                previous_f: tp.Optional[Frame] = None
                block_compatible = True
                reblock_compatible = True

                for f in frames:
                    if len(f.columns) != len(columns) or (f.columns != columns).any():
                        f = f.reindex(columns=columns, fill_value=fill_value)

                    type_blocks.append(f._blocks)
                    # column size is all the same by this point
                    if previous_f is not None:  # after the first
                        if block_compatible:
                            block_compatible &= f._blocks.block_compatible(
                                previous_f._blocks,
                                axis=1)  # only compare columns
                        if reblock_compatible:
                            reblock_compatible &= f._blocks.reblock_compatible(
                                previous_f._blocks)
                    previous_f = f

                yield from TypeBlocks.vstack_blocks_to_blocks(
                    type_blocks=type_blocks,
                    block_compatible=block_compatible,
                    reblock_compatible=reblock_compatible,
                )
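This excerpt appears to be the inner blocks() generator of a vertical concatenation routine; the public entry point for that operation is Frame.from_concat. A hedged sketch of that API (data is hypothetical):

    import static_frame as sf

    f1 = sf.Frame.from_records(((1, 'a'), (2, 'b')), columns=('x', 'y'))
    f2 = sf.Frame.from_records(((3, 'c'),), columns=('x', 'y'))
    # stack rows; index=sf.IndexAutoFactory avoids index label collisions
    f3 = sf.Frame.from_concat((f1, f2), axis=0, index=sf.IndexAutoFactory)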
Code example #7
File: pivot.py  Project: vishalbelsare/static-frame
def pivot_items(
    blocks: TypeBlocks,
    group_fields_iloc: tp.Iterable[tp.Hashable],
    group_depth: int,
    data_field_iloc: tp.Hashable,
    func_single: UFunc,
) -> tp.Iterator[tp.Tuple[tp.Hashable, tp.Any]]:
    '''
    Specialized generator of pairs for when we have only one data_field and one function.
    '''
    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0]  #type: ignore

    for label, _, sub in blocks.group(axis=0, key=group_key):
        # label = group if take_group else group[0]
        # will always be first
        values = sub._extract_array_column(data_field_iloc)
        yield label, func_single(values)
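A hedged sketch of consuming this generator directly, assuming pivot_items is importable from static_frame.core.pivot as the file header suggests; the Frame, field positions, and aggregation function are hypothetical, and the private Frame._blocks is passed in just as pivot_core does below:

    import numpy as np
    from static_frame import Frame
    from static_frame.core.pivot import pivot_items

    f = Frame.from_records(
        (('a', 1), ('a', 2), ('b', 5)),
        columns=('group', 'value'),
    )
    # group rows by column 0; aggregate column 1 with np.sum
    pairs = pivot_items(
        blocks=f._blocks,
        group_fields_iloc=[0],
        group_depth=1,
        data_field_iloc=1,
        func_single=np.sum,
    )
    print(dict(pairs))  # {'a': 3, 'b': 5}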
Code example #8
File: pivot.py  Project: vishalbelsare/static-frame
def pivot_records_items(
    blocks: TypeBlocks,
    group_fields_iloc: tp.Iterable[tp.Hashable],
    group_depth: int,
    data_fields_iloc: tp.Iterable[tp.Hashable],
    func_single: tp.Optional[UFunc],
    func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
) -> tp.Iterator[tp.Tuple[tp.Hashable, tp.Sequence[tp.Any]]]:
    '''
    Given a Frame and pivot parameters, perform the group-by on the group_fields and, within each group, apply the aggregation function(s).
    '''
    # NOTE: this delivers results as (label, row) pairs for use in a Frame.from_records_items constructor
    # take_group_index = group_depth > 1
    # columns_loc_to_iloc = frame.columns._loc_to_iloc

    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0]  #type: ignore
    record_size = len(data_fields_iloc) * (1 if func_single else len(func_map))
    record: tp.List[tp.Any]

    for label, _, part in blocks.group(axis=0, key=group_key):
        # label = group_index if take_group_index else group_index[0]
        record = [None] * record_size  # this size can be pre-allocated
        pos = 0

        if func_single:
            for column_key in data_fields_iloc:
                values = part._extract_array_column(column_key)
                record[pos] = func_single(values)
                pos += 1
        else:
            for column_key in data_fields_iloc:
                values = part._extract_array_column(column_key)
                for _, func in func_map:
                    record[pos] = func(values)
                    pos += 1

        yield label, record
Code example #9
def pivot_records_items_to_frame(
    *,
    blocks: TypeBlocks,
    group_fields_iloc: tp.Iterable[tp.Hashable],
    group_depth: int,
    data_fields_iloc: tp.Iterable[tp.Hashable],
    func_single: tp.Optional[UFunc],
    func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
    func_no: bool,
    kind: str,
    columns_constructor: IndexConstructor,
    columns: tp.List[tp.Hashable],
    index_constructor: IndexConstructor,
    dtypes: tp.Tuple[tp.Optional[np.dtype]],
    frame_cls: tp.Type['Frame'],
) -> 'Frame':
    '''
    Given a Frame and pivot parameters, perform the group-by on the group_fields and, within each group, apply the aggregation function(s).
    '''
    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0]  #type: ignore
    record_size = len(data_fields_iloc) * (1 if (func_single or func_no) else len(func_map))

    index_labels = []
    arrays: tp.List[tp.List[tp.Any]] = [list() for _ in range(record_size)]

    for label, _, part in blocks.group(axis=0, key=group_key, kind=kind):
        index_labels.append(label)
        if func_no:
            if len(part) != 1:
                raise RuntimeError(
                    'pivot requires aggregation of values; provide a `func` argument.'
                )
            for i, column_key in enumerate(data_fields_iloc):
                arrays[i].append(part._extract(0, column_key))
        elif func_single:
            for i, column_key in enumerate(data_fields_iloc):
                arrays[i].append(
                    func_single(part._extract_array_column(column_key)))
        else:
            i = 0
            for column_key in data_fields_iloc:
                values = part._extract_array_column(column_key)
                for _, func in func_map:
                    arrays[i].append(func(values))
                    i += 1

    def gen() -> tp.Iterator[np.ndarray]:
        for b, dtype in zip(arrays, dtypes):
            if dtype is None:
                array, _ = iterable_to_array_1d(b)
            else:
                array = np.array(b, dtype=dtype)
            array.flags.writeable = False
            yield array

    tb = TypeBlocks.from_blocks(gen())
    return frame_cls(
        tb,
        index=index_constructor(index_labels),
        columns=columns_constructor(columns),
        own_data=True,
        own_index=True,
        own_columns=True,
    )
Code example #10
def pivot_core(
    *,
    frame: 'Frame',
    index_fields: tp.List[tp.Hashable],
    columns_fields: tp.List[tp.Hashable],
    data_fields: tp.List[tp.Hashable],
    func_fields: tp.Tuple[tp.Hashable, ...],
    func_single: tp.Optional[UFunc],
    func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
    fill_value: object = np.nan,
    index_constructor: IndexConstructor = None,
    kind: str = DEFAULT_FAST_SORT_KIND,
) -> 'Frame':
    '''Core implementation of Frame.pivot(). The Frame has already been reduced to just relevant columns, and all fields groups are normalized as lists of hashables.
    '''
    from static_frame.core.series import Series
    from static_frame.core.frame import Frame

    func_no = func_single is None and func_map == ()

    data_fields_len = len(data_fields)
    index_depth = len(index_fields)

    # all are lists of hashables; get converted to lists of integers
    columns_loc_to_iloc = frame.columns._loc_to_iloc
    index_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(index_fields)  #type: ignore
    data_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(data_fields)  #type: ignore
    columns_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(columns_fields)  #type: ignore

    # For data fields, we add the field name, not the field values, to the columns.
    columns_name = tuple(columns_fields)
    if data_fields_len > 1 or not columns_fields:
        # if no columns_fields, have to add values label
        columns_name = tuple(chain(columns_fields, ('values', )))
    if len(func_map) > 1:
        columns_name = columns_name + ('func', )

    columns_depth = len(columns_name)

    if columns_depth == 1:
        columns_name = columns_name[0]  # type: ignore
        columns_constructor = partial(frame._COLUMNS_CONSTRUCTOR,
                                      name=columns_name)
    else:
        columns_constructor = partial(
            frame._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
            depth_reference=columns_depth,
            name=columns_name)

    dtype_map = frame.dtypes  # returns a Series
    if func_no:
        dtypes_per_data_fields = tuple(dtype_map[field]
                                       for field in data_fields)
        if data_fields_len == 1:
            dtype_single = dtype_map[data_fields[0]]
    else:
        dtypes_per_data_fields = tuple(
            pivot_records_dtypes(
                dtype_map=dtype_map,
                data_fields=data_fields,
                func_single=func_single,
                func_map=func_map,
            ))
        if func_single and data_fields_len == 1:
            dtype_single = ufunc_dtype_to_dtype(func_single,
                                                dtype_map[data_fields[0]])

    fill_value_dtype = dtype_from_element(fill_value)

    #---------------------------------------------------------------------------
    # First major branch: if we are only grouping by index fields. This can be done in a single group-by operation on those fields. The final index is not known until the group-by is performed.

    if not columns_fields:  # group by is only index_fields
        columns = data_fields if (func_no or func_single) else tuple(
            product(data_fields, func_fields))
        # NOTE: at this time we do not automatically give back an IndexHierarchy when index_depth is == 1, as the order of the resultant values may not be hierarchable.
        name_index = index_fields[0] if index_depth == 1 else tuple(
            index_fields)
        if index_constructor:
            index_constructor = partial(index_constructor, name=name_index)
        else:
            index_constructor = partial(Index, name=name_index)

        if len(columns) == 1:
            # length of columns is equal to length of datafields, func_map not needed
            f = pivot_items_to_frame(
                blocks=frame._blocks,
                group_fields_iloc=index_fields_iloc,
                group_depth=index_depth,
                data_field_iloc=data_fields_iloc[0],
                func_single=func_single,
                frame_cls=frame.__class__,
                name=columns[0],
                dtype=dtype_single,
                index_constructor=index_constructor,
                columns_constructor=columns_constructor,
                kind=kind,
            )
        else:
            f = pivot_records_items_to_frame(
                blocks=frame._blocks,
                group_fields_iloc=index_fields_iloc,
                group_depth=index_depth,
                data_fields_iloc=data_fields_iloc,
                func_single=func_single,
                func_map=func_map,
                func_no=func_no,
                kind=kind,
                columns_constructor=columns_constructor,
                columns=columns,
                index_constructor=index_constructor,
                dtypes=dtypes_per_data_fields,
                frame_cls=frame.__class__,
            )
        columns_final = (f.columns.rename(columns_name) if columns_depth == 1
                         else columns_constructor(f.columns))
        return f.relabel(columns=columns_final)  #type: ignore

    #---------------------------------------------------------------------------
    # Second major branch: we are grouping by index and columns fields. This is done with an outer and inner group-by. The index is calculated ahead of time.

    # avoid doing a multi-column-style selection if not needed
    if len(columns_fields) == 1:
        retuple_group_label = True
    else:
        retuple_group_label = False

    columns_loc_to_iloc = frame.columns._loc_to_iloc
    # group by on 1 or more columns fields
    # NOTE: explored doing one group on index and columns that insert into pre-allocated arrays, but that proved slower than this approach
    group_key = columns_fields_iloc if len(columns_fields_iloc) > 1 else columns_fields_iloc[0]

    index_outer = pivot_outer_index(
        frame=frame,
        index_fields=index_fields,
        index_depth=index_depth,
        index_constructor=index_constructor,
    )

    # collect subframes based on an index of tuples and columns of tuples (if depth > 1)
    sub_blocks = []
    sub_columns_collected: tp.List[tp.Hashable] = []

    for group, _, sub in frame._blocks.group(axis=0, key=group_key, kind=kind):
        # derive the column fields represented by this group
        sub_columns = extrapolate_column_fields(
            columns_fields,
            group if not retuple_group_label else (group, ),
            data_fields,
            func_fields,
        )
        sub_columns_collected.extend(sub_columns)

        sub_frame: Frame
        # if sub_columns length is 1, that means that we only need to extract one column out of the sub blocks
        if len(sub_columns) == 1:
            sub_blocks.append(
                pivot_items_to_block(
                    blocks=sub,
                    group_fields_iloc=index_fields_iloc,
                    group_depth=index_depth,
                    data_field_iloc=data_fields_iloc[0],
                    func_single=func_single,
                    dtype=dtype_single,
                    index_outer=index_outer,
                    fill_value=fill_value,
                    fill_value_dtype=fill_value_dtype,
                    kind=kind,
                ))
        else:
            sub_blocks.extend(
                pivot_records_items_to_blocks(
                    blocks=sub,
                    group_fields_iloc=index_fields_iloc,
                    group_depth=index_depth,
                    data_fields_iloc=data_fields_iloc,
                    func_single=func_single,
                    func_map=func_map,
                    func_no=func_no,
                    fill_value=fill_value,
                    fill_value_dtype=fill_value_dtype,
                    index_outer=index_outer,
                    dtypes=dtypes_per_data_fields,
                    kind=kind,
                ))

    tb = TypeBlocks.from_blocks(sub_blocks)
    return frame.__class__(
        tb,
        index=index_outer,
        columns=columns_constructor(sub_columns_collected),
        own_data=True,
        own_index=True,
        own_columns=True,
    )
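For orientation, the public entry point that dispatches into pivot_core is Frame.pivot. A hedged sketch (column names and data are hypothetical):

    import numpy as np
    from static_frame import Frame

    f = Frame.from_records(
        (('a', 'x', 1), ('a', 'y', 2), ('b', 'x', 3), ('b', 'y', 4)),
        columns=('idx', 'col', 'val'),
    )
    p = f.pivot(
        index_fields='idx',
        columns_fields='col',
        data_fields='val',
        func=np.sum,
    )
    # p has index ('a', 'b') and columns ('x', 'y'), holding summed values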
Code example #11
def pivot_items_to_frame(
    *,
    blocks: TypeBlocks,
    group_fields_iloc: tp.Iterable[tp.Hashable],
    group_depth: int,
    data_field_iloc: tp.Hashable,
    func_single: tp.Optional[UFunc],
    frame_cls: tp.Type['Frame'],
    name: NameType,
    dtype: np.dtype,
    index_constructor: IndexConstructor,
    columns_constructor: IndexConstructor,
    kind: str,
) -> 'Frame':
    '''
    Specialized handler for when we have only one data_field and one function.
    This version returns a Frame.
    '''

    from static_frame.core.series import Series
    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0]  #type: ignore

    if func_single:
        labels = []
        values = []
        for label, _, v in blocks.group_extract(
                axis=0,
                key=group_key,
                extract=data_field_iloc,
                kind=kind,
        ):
            labels.append(label)
            values.append(func_single(v))

        if dtype is None:
            array, _ = iterable_to_array_1d(values, count=len(values))
        else:
            array = np.array(values, dtype=dtype)
        array.flags.writeable = False
        index = index_constructor(labels)
        return frame_cls.from_elements(
            array,
            index=index,
            own_index=True,
            columns=(name, ),
            columns_constructor=columns_constructor,
        )
    # func_no scenario
    if group_depth == 1:
        index = index_constructor(blocks._extract_array_column(group_key))
    else:
        index = index_constructor(
            tuple(label)
            for label in blocks._extract_array(column_key=group_key))

    array = blocks._extract_array_column(data_field_iloc)
    return frame_cls.from_elements(
        array,
        index=index,
        own_index=True,
        columns=(name, ),
        columns_constructor=columns_constructor,
    )
Code example #12
def pivot_items_to_block(
    *,
    blocks: TypeBlocks,
    group_fields_iloc: tp.Iterable[tp.Hashable],
    group_depth: int,
    data_field_iloc: tp.Hashable,
    func_single: tp.Optional[UFunc],
    dtype: tp.Optional[np.dtype],
    fill_value: tp.Any,
    fill_value_dtype: np.dtype,
    index_outer: 'IndexBase',
    kind: str,
) -> np.ndarray:
    '''
    Specialized handler for when we have only one data_field and one function.
    This version returns a single immutable array aligned to index_outer.
    '''
    from static_frame.core.series import Series
    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0]  #type: ignore

    if func_single and dtype is not None:
        array = np.full(
            len(index_outer),
            fill_value,
            dtype=resolve_dtype(dtype, fill_value_dtype),
        )
        for label, _, values in blocks.group_extract(
                axis=0,
                key=group_key,
                extract=data_field_iloc,
                kind=kind,
        ):
            array[index_outer._loc_to_iloc(label)] = func_single(values)
        array.flags.writeable = False
        return array

    if func_single and dtype is None:

        def gen() -> tp.Iterator[tp.Tuple[int, tp.Any]]:
            for label, _, values in blocks.group_extract(
                    axis=0,
                    key=group_key,
                    extract=data_field_iloc,
                    kind=kind,
            ):
                yield index_outer._loc_to_iloc(label), func_single(values)

        post = Series.from_items(gen())
        if len(post) == len(index_outer):
            array = np.empty(len(index_outer), dtype=post.dtype)
        else:
            array = np.full(
                len(index_outer),
                fill_value,
                dtype=resolve_dtype(post.dtype, fill_value_dtype),
            )
        array[post.index.values] = post.values
        array.flags.writeable = False
        return array

    # func_no scenario as no mapping here
    if group_depth == 1:
        labels = [
            index_outer._loc_to_iloc(label)
            for label in blocks._extract_array_column(group_key)
        ]
    else:
        # NOTE: might replace _extract_array_column with an iterator of tuples
        labels = [
            index_outer._loc_to_iloc(tuple(label))
            for label in blocks._extract_array(column_key=group_key)
        ]

    values = blocks._extract_array_column(data_field_iloc)
    if len(values) == len(index_outer):
        array = np.empty(len(index_outer), dtype=dtype)
    else:
        array = np.full(
            len(index_outer),
            fill_value,
            dtype=resolve_dtype(values.dtype, fill_value_dtype),
        )
    array[labels] = values
    array.flags.writeable = False
    return array
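The fill logic above hinges on resolve_dtype, which finds a dtype wide enough to hold both the computed values and the fill value. A minimal sketch of that behavior, assuming resolve_dtype is importable from static_frame.core.util as used in this module:

    import numpy as np
    from static_frame.core.util import resolve_dtype

    # an integer result array that must also hold a float fill value (e.g. NaN)
    print(resolve_dtype(np.dtype(np.int64), np.dtype(np.float64)))  # float64
    # incompatible kinds fall back to object
    print(resolve_dtype(np.dtype(np.int64), np.dtype('<U1')))  # object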
Code example #13
def pivot_records_items_to_blocks(
    *,
    blocks: TypeBlocks,
    group_fields_iloc: tp.Iterable[tp.Hashable],
    group_depth: int,
    data_fields_iloc: tp.Iterable[tp.Hashable],
    func_single: tp.Optional[UFunc],
    func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
    func_no: bool,
    fill_value: tp.Any,
    fill_value_dtype: np.dtype,
    index_outer: 'IndexBase',
    dtypes: tp.Tuple[tp.Optional[np.dtype]],
    kind: str,
) -> tp.List[np.ndarray]:
    '''
    Given a Frame and pivot parameters, perform the group-by on the group_fields and, within each group, apply the aggregation function(s).
    '''
    # NOTE: this delivers results as (label, row) pairs for use in a Frame.from_records_items constructor

    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0]  #type: ignore
    arrays: tp.List[tp.Union[tp.List[tp.Any], np.ndarray]] = []
    for dtype in dtypes:
        if dtype is None:
            # we can use fill_value here, as either it will be completely replaced (and not affect dtype evaluation) or be needed (and already there)
            arrays.append([fill_value] * len(index_outer))
        else:
            arrays.append(np.empty(len(index_outer), dtype=dtype))

    # try to use the dtype specified; fill values at end if necessary
    # collect all possible ilocs, and remove as observed; if any remain, we have fill targets
    iloc_not_found: tp.Set[int] = set(range(len(index_outer)))
    # each group forms a row, each label a value in the index
    for label, _, part in blocks.group(axis=0, key=group_key, kind=kind):
        iloc: int = index_outer._loc_to_iloc(label)  #type: ignore
        iloc_not_found.remove(iloc)
        if func_no:
            if len(part) != 1:
                raise RuntimeError(
                    'pivot requires aggregation of values; provide a `func` argument.'
                )
            for arrays_key, column_key in enumerate(data_fields_iloc):
                # this is equivalent to extracting a row, but doing so would force a type consolidation
                arrays[arrays_key][iloc] = part._extract(0, column_key)
        elif func_single:
            for arrays_key, column_key in enumerate(data_fields_iloc):
                arrays[arrays_key][iloc] = func_single(
                    part._extract_array_column(column_key))
        else:
            arrays_key = 0
            for column_key in data_fields_iloc:
                values = part._extract_array_column(column_key)
                for _, func in func_map:
                    arrays[arrays_key][iloc] = func(values)
                    arrays_key += 1

    if iloc_not_found:
        # we did not fill all arrays and have values that need to be filled
        # order does not matter
        fill_targets = list(iloc_not_found)
        # mutate in place then make immutable
        for arrays_key in range(len(arrays)):  #pylint: disable=C0200
            array = arrays[arrays_key]
            if array.__class__ is not np.ndarray:  # a list
                array, _ = iterable_to_array_1d(array, count=len(index_outer))
                arrays[arrays_key] = array  # restore new array
            else:
                dtype_resolved = resolve_dtype(
                    array.dtype, fill_value_dtype)  # type: ignore
                if array.dtype != dtype_resolved:  # type: ignore
                    array = array.astype(dtype_resolved)  #type: ignore
                    array[fill_targets] = fill_value
                    arrays[arrays_key] = array  # re-assign new array
            array.flags.writeable = False  # type: ignore
    else:
        for arrays_key in range(len(arrays)):  #pylint: disable=C0200
            array = arrays[arrays_key]
            if array.__class__ is not np.ndarray:  # a list
                array, _ = iterable_to_array_1d(array, count=len(index_outer))
                arrays[arrays_key] = array  # re-assign new array
            array.flags.writeable = False
    return arrays
Code example #14
    def _from_archive(
        cls,
        *,
        constructor: tp.Type['Frame'],
        fp: PathSpecifier,
        memory_map: bool = False,
    ) -> tp.Tuple['Frame', Archive]:
        '''
        Create a :obj:`Frame` from an npz file.
        '''
        from static_frame.core.type_blocks import TypeBlocks

        archive = cls._ARCHIVE_CLS(
            fp,
            writeable=False,
            memory_map=memory_map,
        )
        metadata = archive.read_metadata()

        # JSON will bring back tuple `name` attributes as lists; these must be converted to tuples to be hashable. Alternatives (like storing repr and using literal_eval) are slower than JSON.
        name, name_index, name_columns = (list_to_tuple(n)
                                          for n in metadata[Label.KEY_NAMES])

        block_count, depth_index, depth_columns = metadata[Label.KEY_DEPTHS]
        cls_index, cls_columns = (ContainerMap.str_to_cls(name)
                                  for name in metadata[Label.KEY_TYPES])

        index = ArchiveIndexConverter._index_decode(
            archive=archive,
            metadata=metadata,
            key_template_values=Label.FILE_TEMPLATE_VALUES_INDEX,
            key_types=Label.KEY_TYPES_INDEX,
            depth=depth_index,
            cls_index=cls_index,
            name=name_index,
        )

        columns = ArchiveIndexConverter._index_decode(
            archive=archive,
            metadata=metadata,
            key_template_values=Label.FILE_TEMPLATE_VALUES_COLUMNS,
            key_types=Label.KEY_TYPES_COLUMNS,
            depth=depth_columns,
            cls_index=cls_columns,
            name=name_columns,
        )

        tb = TypeBlocks.from_blocks(
            archive.read_array(Label.FILE_TEMPLATE_BLOCKS.format(i))
            for i in range(block_count))

        f = constructor(
            tb,
            own_data=True,
            index=index,
            own_index=False if index is None else True,
            columns=columns,
            own_columns=False if columns is None else True,
            name=name,
        )

        return f, archive
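A hedged sketch of the public round-trip that drives this constructor; the file path is hypothetical, and Frame.to_npz / Frame.from_npz are assumed to be the public wrappers over this archive machinery:

    from static_frame import Frame

    f1 = Frame.from_records((('a', 1), ('b', 2)), columns=('k', 'v'), name='f1')
    f1.to_npz('/tmp/f1.npz')
    f2 = Frame.from_npz('/tmp/f1.npz')
    assert f1.equals(f2, compare_name=True, compare_dtype=True)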
Code example #15
File: pivot.py  Project: vishalbelsare/static-frame
def pivot_core(
    *,
    frame: 'Frame',
    index_fields: tp.List[tp.Hashable],
    columns_fields: tp.List[tp.Hashable],
    data_fields: tp.List[tp.Hashable],
    func_fields: tp.Tuple[tp.Hashable, ...],
    func_single: tp.Optional[UFunc],
    func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
    fill_value: object = np.nan,
    index_constructor: IndexConstructor = None,
) -> 'Frame':
    '''Core implementation of Frame.pivot(). The Frame has already been reduced to just relevant columns, and all fields groups are normalized as lists of hashables.
    '''
    from static_frame.core.series import Series
    from static_frame.core.frame import Frame

    data_fields_len = len(data_fields)
    index_depth = len(index_fields)

    # all are lists of hashables; get converted to lists of integers
    columns_loc_to_iloc = frame.columns._loc_to_iloc
    index_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(index_fields)  #type: ignore
    data_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(data_fields)  #type: ignore
    columns_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(columns_fields)  #type: ignore

    # For data fields, we add the field name, not the field values, to the columns.
    columns_name = tuple(columns_fields)
    if data_fields_len > 1 or not columns_fields:
        # if no columns_fields, have to add values label
        columns_name = tuple(chain(columns_fields, ('values', )))  # chain the fields list itself, not its unpacked elements
    if len(func_map) > 1:
        columns_name = columns_name + ('func', )

    columns_depth = len(columns_name)
    if columns_depth == 1:
        columns_name = columns_name[0]  # type: ignore
        columns_constructor = partial(frame._COLUMNS_CONSTRUCTOR,
                                      name=columns_name)
    else:
        columns_constructor = partial(
            frame._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
            depth_reference=columns_depth,
            name=columns_name)

    dtype_map = frame.dtypes
    dtypes_per_data_fields = tuple(
        pivot_records_dtypes(
            dtype_map=dtype_map,
            data_fields=data_fields,
            func_single=func_single,
            func_map=func_map,
        ))
    if func_single and data_fields_len == 1:
        dtype_single = ufunc_dtype_to_dtype(func_single,
                                            dtype_map[data_fields[0]])

    #---------------------------------------------------------------------------
    # first major branch: if we are only grouping by index fields

    if not columns_fields:  # group by is only index_fields
        columns = data_fields if func_single else tuple(
            product(data_fields, func_fields))

        # NOTE: at this time we do not automatically give back an IndexHierarchy when index_depth is == 1, as the order of the resultant values may not be hierarchable.
        name_index = index_fields[0] if index_depth == 1 else tuple(
            index_fields)
        if index_constructor:
            index_constructor = partial(index_constructor, name=name_index)
        else:
            index_constructor = partial(Index, name=name_index)

        if len(columns) == 1:
            # assert len(data_fields) == 1
            f = frame.from_series(
                Series.from_items(
                    pivot_items(
                        blocks=frame._blocks,
                        group_fields_iloc=index_fields_iloc,
                        group_depth=index_depth,
                        data_field_iloc=data_fields_iloc[0],
                        func_single=func_single,
                    ),
                    name=columns[0],
                    index_constructor=index_constructor,
                    dtype=dtype_single,
                ),
                columns_constructor=columns_constructor,
            )
        else:
            f = frame.from_records_items(
                pivot_records_items(
                    blocks=frame._blocks,
                    group_fields_iloc=index_fields_iloc,
                    group_depth=index_depth,
                    data_fields_iloc=data_fields_iloc,
                    func_single=func_single,
                    func_map=func_map,
                ),
                columns_constructor=columns_constructor,
                columns=columns,
                index_constructor=index_constructor,
                dtypes=dtypes_per_data_fields,
            )

        # have to rename columns if derived in from_concat
        columns_final = (f.columns.rename(columns_name) if columns_depth == 1
                         else columns_constructor(f.columns))
        return f.relabel(columns=columns_final)  #type: ignore

    #---------------------------------------------------------------------------
    # second major branch: we are grouping by index and columns fields

    # avoid doing a multi-column-style selection if not needed
    if len(columns_fields) == 1:
        # columns_group = columns_fields[0]
        retuple_group_label = True
    else:
        # columns_group = columns_fields
        retuple_group_label = False

    columns_loc_to_iloc = frame.columns._loc_to_iloc
    # group by on 1 or more columns fields
    # NOTE: explored doing one group on index and columns that insert into pre-allocated arrays, but that proved slower than this approach
    group_key = columns_fields_iloc if len(columns_fields_iloc) > 1 else columns_fields_iloc[0]

    index_outer = pivot_outer_index(
        frame=frame,
        index_fields=index_fields,
        index_depth=index_depth,
        index_constructor=index_constructor,
    )

    # collect subframes based on an index of tuples and columns of tuples (if depth > 1)
    sub_blocks = []
    sub_columns_collected: tp.List[tp.Hashable] = []

    # for group, sub in frame.iter_group_items(columns_group):
    for group, _, sub in frame._blocks.group(axis=0, key=group_key):
        # derive the column fields represented by this group
        sub_columns = extrapolate_column_fields(
            columns_fields, group if not retuple_group_label else (group, ),
            data_fields, func_fields)
        sub_columns_collected.extend(sub_columns)

        # sub is TypeBlocks unique value in columns_group; this may or may not have unique index fields; if not, it needs to be aggregated
        if index_depth == 1:
            sub_index_labels = sub._extract_array_column(index_fields_iloc[0])
            sub_index_labels_unique = ufunc_unique(sub_index_labels)
        else:  # match to an index of tuples; the order might not be the same as IH
            # NOTE: might be able to keep arrays and concat below
            sub_index_labels = tuple(
                zip(*(sub._extract_array_column(columns_loc_to_iloc(f))
                      for f in index_fields)))
            sub_index_labels_unique = set(sub_index_labels)

        sub_frame: tp.Union[Frame, Series]

        # if sub_index_labels are not unique we need to aggregate
        if len(sub_index_labels_unique) != len(sub_index_labels):
            # if sub_columns length is 1, that means that we only need to extract one column out of the sub Frame
            if len(sub_columns) == 1:
                assert len(data_fields) == 1
                # NOTE: grouping on index_fields; can pre-process array_to_groups_and_locations
                sub_frame = Series.from_items(
                    pivot_items(
                        blocks=sub,
                        group_fields_iloc=index_fields_iloc,
                        group_depth=index_depth,
                        data_field_iloc=data_fields_iloc[0],
                        func_single=func_single,
                    ),
                    dtype=dtype_single,
                )
            else:
                sub_frame = Frame.from_records_items(
                    pivot_records_items(
                        blocks=sub,
                        group_fields_iloc=index_fields_iloc,
                        group_depth=index_depth,
                        data_fields_iloc=data_fields_iloc,
                        func_single=func_single,
                        func_map=func_map,
                    ),
                    dtypes=dtypes_per_data_fields,
                )
        else:
            # we have unique values per index item, but may not have a complete index
            if func_single:
                # NOTE: should apply function even with func_single
                if len(data_fields) == 1:
                    sub_frame = Frame(
                        sub._extract_array_column(data_fields_iloc[0]),
                        index=sub_index_labels,
                        index_constructor=index_constructor,
                        own_data=True,
                    )
                else:
                    sub_frame = Frame(
                        sub._extract(row_key=None, column_key=data_fields_iloc),
                        index=sub_index_labels,
                        index_constructor=index_constructor,
                        own_data=True,
                    )
            else:

                def blocks() -> tp.Iterator[np.ndarray]:
                    for field in data_fields_iloc:
                        for _, func in func_map:
                            yield sub._extract_array_column(field)

                sub_frame = Frame(
                    TypeBlocks.from_blocks(blocks()),
                    index=sub_index_labels,
                    own_data=True,
                )

        sub_frame = sub_frame.reindex(
            index_outer,
            own_index=True,
            fill_value=fill_value,
        )
        if sub_frame.ndim == 1:
            sub_blocks.append(sub_frame.values)
        else:
            sub_blocks.extend(sub_frame._blocks._blocks)  # type: ignore

    tb = TypeBlocks.from_blocks(sub_blocks)
    return frame.__class__(
        tb,
        index=index_outer,
        columns=columns_constructor(sub_columns_collected),
        own_data=True,
        own_index=True,
        own_columns=True,
    )