def read(
        self,
        label: tp.Optional[str] = None,
        *,
        config: tp.Optional[StoreConfig] = None,
        container_type: tp.Type[Frame] = Frame,
        ) -> Frame:
    '''
    Read the table stored under the node named by ``label`` and return it as a ``container_type``.

    The leading ``config.index_depth`` table columns are collected as index arrays; all remaining columns become data blocks labelled with their table column names.

    Args:
        {dtypes}
    '''
    import tables

    # fall back to a default configuration when none is given
    cfg = StoreConfig() if config is None else config
    if cfg.dtypes:
        raise NotImplementedError(
                'using config.dtypes on HDF5 not yet supported')

    depth_index = cfg.index_depth
    depth_columns = cfg.columns_depth

    # both lists are populated as a side effect of consuming the generator below
    index_arrays = []
    columns_labels = []

    def blocks(table: tp.Any) -> tp.Iterator[np.ndarray]:
        for position, colname in enumerate(table.cols._v_colnames):
            # can also do: table.read(field=colname)
            column = table.col(colname)
            if column.dtype.kind in DTYPE_STR_KIND:
                column = column.astype(str)
            column.flags.writeable = False

            if position >= depth_index:
                # only store column labels for those yielded
                columns_labels.append(colname)
                yield column
            else:
                index_arrays.append(column)

    with tables.open_file(self._fp, mode='r') as file:
        # the generator must be exhausted while the file is still open
        source = blocks(file.get_node(f'/{label}'))
        if cfg.consolidate_blocks:
            source = TypeBlocks.consolidate_blocks(source)
        data = TypeBlocks.from_blocks(source)

    return container_type._from_data_index_arrays_column_labels(
            data=data,
            index_depth=depth_index,
            index_arrays=index_arrays,
            columns_depth=depth_columns,
            columns_labels=columns_labels,
            name=tp.cast(tp.Hashable, label) # not sure why this is necessary
            )
def read_many(self,
        labels: tp.Iterable[tp.Hashable],
        *,
        config: StoreConfigMapInitializer = None,
        container_type: tp.Type[Frame] = Frame,
        ) -> tp.Iterator[Frame]:
    '''
    Yield one ``container_type`` per entry of ``labels``, reading every table through a single open handle on the file.

    Each label is looked up in the config map for its own configuration; the node name is derived with the label encoder of the map's default configuration.
    '''
    import tables

    config_map = StoreConfigMap.from_initializer(config)

    def blocks(
            table: tp.Any,
            depth_index: int,
            index_arrays: tp.List[np.ndarray],
            columns_labels: tp.List[tp.Hashable],
            ) -> tp.Iterator[np.ndarray]:
        # Routes leading columns into ``index_arrays``; yields the rest and records their names in ``columns_labels``.
        for position, colname in enumerate(table.cols._v_colnames):
            # can also do: table.read(field=colname)
            column = table.col(colname)
            if column.dtype.kind in DTYPE_STR_KINDS:
                column = column.astype(str)
            column.flags.writeable = False

            if position >= depth_index:
                # only store column labels for those yielded
                columns_labels.append(colname)
                yield column
            else:
                index_arrays.append(column)

    with tables.open_file(self._fp, mode='r') as file:
        for label in labels:
            c = config_map[label]
            label_encoded = config_map.default.label_encode(label)

            if c.dtypes:
                raise NotImplementedError('using config.dtypes on HDF5 not yet supported')

            depth_index = c.index_depth
            index_arrays = []
            columns_labels = []

            source = blocks(
                    file.get_node(f'/{label_encoded}'),
                    depth_index,
                    index_arrays,
                    columns_labels,
                    )
            if c.consolidate_blocks:
                source = TypeBlocks.consolidate_blocks(source)
            data = TypeBlocks.from_blocks(source)

            # this will own_data in subsequent constructor call
            yield container_type._from_data_index_arrays_column_labels(
                    data=data,
                    index_depth=depth_index,
                    index_arrays=index_arrays,
                    index_constructors=c.index_constructors,
                    columns_depth=c.columns_depth,
                    columns_labels=columns_labels,
                    columns_constructors=c.columns_constructors,
                    name=label,
                    )
def to_type_blocks(self) -> TypeBlocks:
    '''
    Return a correctly typed TypeBlocks built from one array per depth level; a depth of zero gives a zero-sized TypeBlocks.
    '''
    depth = self.depth
    if depth:
        per_level = (self.values_at_depth(level) for level in range(depth))
        return TypeBlocks.from_blocks(per_level)
    # no depth: nothing to extract
    return TypeBlocks.from_zero_size_shape()
def to_type_blocks(self) -> TypeBlocks:
    '''
    Return a correctly typed TypeBlocks built from one array per depth level.

    If reading ``depth`` raises ``StopIteration``, a zero-sized TypeBlocks is returned instead.
    '''
    try:
        depth = self.depth
    except StopIteration:
        # assume we have no depth or length
        return TypeBlocks.from_zero_size_shape()

    per_level = (self.values_at_depth(level) for level in range(depth))
    return TypeBlocks.from_blocks(per_level)
def _index_decode(
        *,
        archive: Archive,
        metadata: tp.Dict[str, tp.Any],
        key_template_values: str,
        key_types: str,
        depth: int,
        cls_index: tp.Type['IndexBase'],
        name: NameType,
        ) -> tp.Optional['IndexBase']:
    '''Build an index (or columns) from arrays stored in ``archive``.

    Returns None when the archive holds no array for the first depth level. A depth of one builds ``cls_index`` directly from that array; otherwise all depth-level arrays are combined and per-level constructors are resolved from ``metadata[key_types]``.
    '''
    from static_frame.core.type_blocks import TypeBlocks

    first_key = key_template_values.format(0)
    if first_key not in archive.labels:
        return None

    if depth == 1:
        return cls_index(archive.read_array(first_key), name=name)

    levels = (
            archive.read_array(key_template_values.format(level))
            for level in range(depth)
            )
    index_tb = TypeBlocks.from_blocks(levels)
    level_constructors = [
            ContainerMap.str_to_cls(cls_name)
            for cls_name in metadata[key_types]
            ]
    return cls_index._from_type_blocks(
            index_tb, # type: ignore
            name=name,
            index_constructors=level_constructors,
            )
def blocks() -> tp.Iterator[np.ndarray]:
    # Align every frame to the shared ``columns``, then yield the vertically stacked blocks.
    collected = []
    prior = None
    all_block_compatible = True
    all_reblock_compatible = True

    for f in frames:
        needs_reindex = (
                len(f.columns) != len(columns)
                or (f.columns != columns).any()
                )
        if needs_reindex:
            f = f.reindex(columns=columns, fill_value=fill_value)

        current = f._blocks
        collected.append(current)
        # column size is all the same by this point
        if prior is not None: # after the first
            # once a flag is False it stays False and the check is skipped
            if all_block_compatible:
                all_block_compatible &= current.block_compatible(
                        prior,
                        axis=1) # only compare columns
            if all_reblock_compatible:
                all_reblock_compatible &= current.reblock_compatible(prior)
        prior = current

    yield from TypeBlocks.vstack_blocks_to_blocks(
            type_blocks=collected,
            block_compatible=all_block_compatible,
            reblock_compatible=all_reblock_compatible,
            )
def pivot_items(
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_field_iloc: tp.Hashable,
        func_single: UFunc,
        ) -> tp.Iterator[tp.Tuple[tp.Hashable, tp.Any]]:
    '''
    Specialized generator of (label, value) pairs for when there is only one data field and one function.

    Groups ``blocks`` along axis 0 and yields, per group, the group label and ``func_single`` applied to the group's data column.
    '''
    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        # a single group field is selected by its bare key
        group_key = group_fields_iloc[0] #type: ignore

    for label, _, sub in blocks.group(axis=0, key=group_key):
        yield label, func_single(sub._extract_array_column(data_field_iloc))
def pivot_records_items(
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_fields_iloc: tp.Iterable[tp.Hashable],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]]
        ) -> tp.Iterator[tp.Tuple[tp.Hashable, tp.Sequence[tp.Any]]]:
    '''
    Group ``blocks`` by the group fields and, for each group, yield the group label with a record of aggregated values.

    Each record holds, for every data field in order, the result of ``func_single`` (when given) or of every function in ``func_map`` in order.
    '''
    # NOTE: this delivers results by label row for use in a Frame.from_records_items constructor
    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        group_key = group_fields_iloc[0] #type: ignore

    # normalize to one flat sequence of functions applied to every data field
    if func_single:
        funcs: tp.Tuple[UFunc, ...] = (func_single,)
    else:
        funcs = tuple(func for _, func in func_map)

    record_size = len(data_fields_iloc) * len(funcs)

    record: tp.List[tp.Any]
    for label, _, part in blocks.group(axis=0, key=group_key):
        record = [None] * record_size # This size can be pre allocated
        cursor = 0
        for column_key in data_fields_iloc:
            values = part._extract_array_column(column_key)
            for func in funcs:
                record[cursor] = func(values)
                cursor += 1
        yield label, record
def pivot_records_items_to_frame(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_fields_iloc: tp.Iterable[tp.Hashable],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        func_no: bool,
        kind: str,
        columns_constructor: IndexConstructor,
        columns: tp.List[tp.Hashable],
        index_constructor: IndexConstructor,
        dtypes: tp.Tuple[tp.Optional[np.dtype]],
        frame_cls: tp.Type['Frame'],
        ) -> 'Frame':
    '''
    Group ``blocks`` by the group fields and build a Frame with one row per group.

    Result columns are accumulated as Python lists (one per data field, or one per data field and function pair when ``func_map`` is used) and converted to arrays at the end.

    Raises:
        RuntimeError: if ``func_no`` is set and a group holds more than one row.
    '''
    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        group_key = group_fields_iloc[0] #type: ignore

    field_count = len(data_fields_iloc)
    if func_single or func_no:
        results_per_field = 1
    else:
        results_per_field = len(func_map)

    arrays: tp.List[tp.List[tp.Any]] = [
            [] for _ in range(field_count * results_per_field)
            ]
    index_labels = []

    for label, _, part in blocks.group(axis=0, key=group_key, kind=kind):
        index_labels.append(label)

        if func_no:
            # without a function each group must already be a single row
            if len(part) != 1:
                raise RuntimeError(
                        'pivot requires aggregation of values; provide a `func` argument.'
                        )
            for position, column_key in enumerate(data_fields_iloc):
                arrays[position].append(part._extract(0, column_key))
            continue

        if func_single:
            for position, column_key in enumerate(data_fields_iloc):
                column_values = part._extract_array_column(column_key)
                arrays[position].append(func_single(column_values))
            continue

        # func_map: results are laid out data-field major, function minor
        position = 0
        for column_key in data_fields_iloc:
            column_values = part._extract_array_column(column_key)
            for _, func in func_map:
                arrays[position].append(func(column_values))
                position += 1

    def gen() -> tp.Iterator[np.ndarray]:
        for collected, dtype in zip(arrays, dtypes):
            if dtype is not None:
                array = np.array(collected, dtype=dtype)
            else:
                # no dtype known ahead of time: let the helper determine one
                array, _ = iterable_to_array_1d(collected)
            array.flags.writeable = False
            yield array

    tb = TypeBlocks.from_blocks(gen())
    return frame_cls(
            tb,
            index=index_constructor(index_labels),
            columns=columns_constructor(columns),
            own_data=True,
            own_index=True,
            own_columns=True,
            )
def pivot_core(
        *,
        frame: 'Frame',
        index_fields: tp.List[tp.Hashable],
        columns_fields: tp.List[tp.Hashable],
        data_fields: tp.List[tp.Hashable],
        func_fields: tp.Tuple[tp.Hashable, ...],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        fill_value: object = np.nan,
        index_constructor: IndexConstructor = None,
        kind: str = DEFAULT_FAST_SORT_KIND,
        ) -> 'Frame':
    '''Core implementation of Frame.pivot(). The Frame has already been reduced to just relevant columns, and all fields groups are normalized as lists of hashables.

    Args:
        frame: source Frame; its blocks, columns and dtypes are read here.
        index_fields: column labels whose values form the resulting index.
        columns_fields: column labels whose values form the resulting columns; when empty, a single group-by on ``index_fields`` is performed.
        data_fields: column labels holding the values to place or aggregate.
        func_fields: labels combined with ``data_fields`` to label result columns when ``func_map`` is used.
        func_single: a single aggregation function, or None.
        func_map: pairs of (label, function); an empty tuple together with ``func_single`` being None means no aggregation is applied.
        fill_value: value used for positions of the outer index not observed in a group.
        index_constructor: optional constructor for the resulting index.
        kind: forwarded as ``kind`` to every group operation.

    Returns:
        A new instance of ``frame.__class__``.
    '''
    from static_frame.core.series import Series
    from static_frame.core.frame import Frame

    # no function at all: values are placed, not aggregated
    func_no = func_single is None and func_map == ()

    data_fields_len = len(data_fields)
    index_depth = len(index_fields)

    # all are lists of hashables; get converted to lists of integers
    columns_loc_to_iloc = frame.columns._loc_to_iloc
    index_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(index_fields) #type: ignore
    data_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(data_fields) #type: ignore
    columns_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(columns_fields) #type: ignore

    # For data fields, we add the field name, not the field values, to the columns.
    columns_name = tuple(columns_fields)
    if data_fields_len > 1 or not columns_fields:
        # if no columns_fields, have to add values label
        columns_name = tuple(chain(columns_fields, ('values', )))
    if len(func_map) > 1:
        # multiple functions add one more level to the column labels
        columns_name = columns_name + ('func', )

    columns_depth = len(columns_name)

    if columns_depth == 1:
        # a depth of one uses the flat columns constructor with a scalar name
        columns_name = columns_name[0] # type: ignore
        columns_constructor = partial(frame._COLUMNS_CONSTRUCTOR, name=columns_name)
    else:
        columns_constructor = partial(
                frame._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
                depth_reference=columns_depth,
                name=columns_name)

    dtype_map = frame.dtypes # returns a Series
    # NOTE: ``dtype_single`` is only bound when there is exactly one data field (and, when aggregating, a ``func_single``); the single-column paths below rely on that.
    if func_no:
        # without a function the result dtypes are those of the source columns
        dtypes_per_data_fields = tuple(dtype_map[field] for field in data_fields)
        if data_fields_len == 1:
            dtype_single = dtype_map[data_fields[0]]
    else:
        dtypes_per_data_fields = tuple(
                pivot_records_dtypes(
                        dtype_map=dtype_map,
                        data_fields=data_fields,
                        func_single=func_single,
                        func_map=func_map,
                        ))
        if func_single and data_fields_len == 1:
            dtype_single = ufunc_dtype_to_dtype(func_single, dtype_map[data_fields[0]])

    fill_value_dtype = dtype_from_element(fill_value)

    #---------------------------------------------------------------------------
    # First major branch: if we are only grouping by index fields. This can be done in a single group-by operation on those fields. The final index is not known until the group-by is performed.

    if not columns_fields: # group by is only index_fields
        columns = data_fields if (func_no or func_single) else tuple(
                product(data_fields, func_fields))

        # NOTE: at this time we do not automatically give back an IndexHierarchy when index_depth is == 1, as the order of the resultant values may not be hierarchable.
        name_index = index_fields[0] if index_depth == 1 else tuple(
                index_fields)
        # bind the index name onto whichever constructor is used
        if index_constructor:
            index_constructor = partial(index_constructor, name=name_index)
        else:
            index_constructor = partial(Index, name=name_index)

        if len(columns) == 1:
            # length of columns is equal to length of datafields, func_map not needed
            f = pivot_items_to_frame(
                    blocks=frame._blocks,
                    group_fields_iloc=index_fields_iloc,
                    group_depth=index_depth,
                    data_field_iloc=data_fields_iloc[0],
                    func_single=func_single,
                    frame_cls=frame.__class__,
                    name=columns[0],
                    dtype=dtype_single,
                    index_constructor=index_constructor,
                    columns_constructor=columns_constructor,
                    kind=kind,
                    )
        else:
            f = pivot_records_items_to_frame(
                    blocks=frame._blocks,
                    group_fields_iloc=index_fields_iloc,
                    group_depth=index_depth,
                    data_fields_iloc=data_fields_iloc,
                    func_single=func_single,
                    func_map=func_map,
                    func_no=func_no,
                    kind=kind,
                    columns_constructor=columns_constructor,
                    columns=columns,
                    index_constructor=index_constructor,
                    dtypes=dtypes_per_data_fields,
                    frame_cls=frame.__class__,
                    )
        # apply the final columns name: rename in place for depth one, otherwise rebuild through the hierarchy constructor
        columns_final = (f.columns.rename(columns_name) if columns_depth == 1
                else columns_constructor(f.columns))
        return f.relabel(columns=columns_final) #type: ignore

    #---------------------------------------------------------------------------
    # Second major branch: we are grouping by index and columns fields. This is done with an outer and inner group by. The index is calculated ahead of time.

    # avoid doing a multi-column-style selection if not needed
    if len(columns_fields) == 1:
        retuple_group_label = True
    else:
        retuple_group_label = False

    columns_loc_to_iloc = frame.columns._loc_to_iloc
    # group by on 1 or more columns fields
    # NOTE: explored doing one group on index and columns that insert into pre-allocated arrays, but that proved slower than this approach
    group_key = columns_fields_iloc if len(
            columns_fields_iloc) > 1 else columns_fields_iloc[0]

    index_outer = pivot_outer_index(
            frame=frame,
            index_fields=index_fields,
            index_depth=index_depth,
            index_constructor=index_constructor,
            )

    # collect subframes based on an index of tuples and columns of tuples (if depth > 1)
    sub_blocks = []
    sub_columns_collected: tp.List[tp.Hashable] = []

    for group, _, sub in frame._blocks.group(axis=0, key=group_key, kind=kind):
        # derive the column fields represented by this group
        sub_columns = extrapolate_column_fields(
                columns_fields,
                group if not retuple_group_label else (group, ),
                data_fields,
                func_fields,
                )
        sub_columns_collected.extend(sub_columns)

        sub_frame: Frame
        # if sub_columns length is 1, that means that we only need to extract one column out of the sub blocks
        if len(sub_columns) == 1:
            sub_blocks.append(
                    pivot_items_to_block(
                            blocks=sub,
                            group_fields_iloc=index_fields_iloc,
                            group_depth=index_depth,
                            data_field_iloc=data_fields_iloc[0],
                            func_single=func_single,
                            dtype=dtype_single,
                            index_outer=index_outer,
                            fill_value=fill_value,
                            fill_value_dtype=fill_value_dtype,
                            kind=kind,
                            ))
        else:
            sub_blocks.extend(
                    pivot_records_items_to_blocks(
                            blocks=sub,
                            group_fields_iloc=index_fields_iloc,
                            group_depth=index_depth,
                            data_fields_iloc=data_fields_iloc,
                            func_single=func_single,
                            func_map=func_map,
                            func_no=func_no,
                            fill_value=fill_value,
                            fill_value_dtype=fill_value_dtype,
                            index_outer=index_outer,
                            dtypes=dtypes_per_data_fields,
                            kind=kind,
                            ))

    # blocks were appended in the same order as their column labels
    tb = TypeBlocks.from_blocks(sub_blocks)
    return frame.__class__(
            tb,
            index=index_outer,
            columns=columns_constructor(sub_columns_collected),
            own_data=True,
            own_index=True,
            own_columns=True,
            )
def pivot_items_to_frame(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_field_iloc: tp.Hashable,
        func_single: tp.Optional[UFunc],
        frame_cls: tp.Type['Frame'],
        name: NameType,
        dtype: np.dtype,
        index_constructor: IndexConstructor,
        columns_constructor: IndexConstructor,
        kind: str,
        ) -> 'Frame':
    '''
    Build a single-column Frame for the case of one data field and at most one function.

    With ``func_single``, rows are the groups of ``blocks`` and values are the function applied to each group's data column. Without it, the group field values and the data column are used as they are.
    '''
    from static_frame.core.series import Series

    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        group_key = group_fields_iloc[0] #type: ignore

    if not func_single:
        # func_no scenario: no grouping, labels and values are taken directly
        if group_depth == 1:
            index = index_constructor(blocks._extract_array_column(group_key))
        else:
            index = index_constructor(
                    tuple(row)
                    for row in blocks._extract_array(column_key=group_key))
        array = blocks._extract_array_column(data_field_iloc)
    else:
        group_labels = []
        aggregated = []
        for label, _, group_values in blocks.group_extract(
                axis=0,
                key=group_key,
                extract=data_field_iloc,
                kind=kind,
                ):
            group_labels.append(label)
            aggregated.append(func_single(group_values))

        if dtype is not None:
            array = np.array(aggregated, dtype=dtype)
        else:
            array, _ = iterable_to_array_1d(aggregated, count=len(aggregated))
        array.flags.writeable = False
        index = index_constructor(group_labels)

    return frame_cls.from_elements(
            array,
            index=index,
            own_index=True,
            columns=(name, ),
            columns_constructor=columns_constructor,
            )
def pivot_items_to_block(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_field_iloc: tp.Hashable,
        func_single: tp.Optional[UFunc],
        dtype: tp.Optional[np.dtype],
        fill_value: tp.Any,
        fill_value_dtype: np.dtype,
        index_outer: 'IndexBase',
        kind: str,
        ) -> np.ndarray:
    '''
    Build one immutable 1D array, aligned to ``index_outer``, for the case of one data field and at most one function.

    Args:
        blocks: the rows to place or aggregate.
        group_fields_iloc: column positions of the group (index) fields.
        group_depth: number of group fields; with a depth of one the first entry of ``group_fields_iloc`` is used as a bare key.
        data_field_iloc: column position of the data field.
        func_single: aggregation function applied per group, or None to place values without aggregation.
        dtype: dtype of the result values if known; when None it is taken from the produced values.
        fill_value: value for positions of ``index_outer`` that receive no value.
        fill_value_dtype: dtype of ``fill_value``, used to resolve the dtype of a partially filled array.
        index_outer: index providing label-to-position lookup and the result length.
        kind: forwarded as ``kind`` to the group operation.

    Returns:
        An array of length ``len(index_outer)`` with ``writeable`` set to False.
    '''
    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0] #type: ignore
    size = len(index_outer)
    loc_to_iloc = index_outer._loc_to_iloc

    if func_single and dtype is not None:
        # dtype known ahead of time: pre-fill, then overwrite observed positions
        array = np.full(
                size,
                fill_value,
                dtype=resolve_dtype(dtype, fill_value_dtype),
                )
        for label, _, values in blocks.group_extract(
                axis=0,
                key=group_key,
                extract=data_field_iloc,
                kind=kind,
                ):
            array[loc_to_iloc(label)] = func_single(values)
        array.flags.writeable = False
        return array

    if func_single and dtype is None:
        # only this branch needs Series, so the import is deferred to here
        from static_frame.core.series import Series

        def gen() -> tp.Iterator[tp.Tuple[int, tp.Any]]:
            for label, _, values in blocks.group_extract(
                    axis=0,
                    key=group_key,
                    extract=data_field_iloc,
                    kind=kind,
                    ):
                yield loc_to_iloc(label), func_single(values)

        # the Series is keyed by target position and discovers the dtype
        post = Series.from_items(gen())
        if len(post) == size:
            array = np.empty(size, dtype=post.dtype)
        else:
            array = np.full(
                    size,
                    fill_value,
                    dtype=resolve_dtype(post.dtype, fill_value_dtype),
                    )
        array[post.index.values] = post.values
        array.flags.writeable = False
        return array

    # func_no scenario as no mapping here
    # NOTE(review): repeated labels are not detected here; the last value assigned wins. Confirm callers guarantee unique labels.
    if group_depth == 1:
        ilocs = [
                loc_to_iloc(label)
                for label in blocks._extract_array_column(group_key)
                ]
    else:
        # NOTE: might replace _extract_array_column with an iterator of tuples
        ilocs = [
                loc_to_iloc(tuple(label))
                for label in blocks._extract_array(column_key=group_key)
                ]

    values = blocks._extract_array_column(data_field_iloc)
    if len(values) == size:
        # every position is assigned below, so no fill is needed. When no dtype is given, use the dtype of the values: np.empty would otherwise default to float64
        array = np.empty(size, dtype=values.dtype if dtype is None else dtype)
    else:
        array = np.full(
                size,
                fill_value,
                dtype=resolve_dtype(values.dtype, fill_value_dtype),
                )
    array[ilocs] = values
    array.flags.writeable = False
    return array
def pivot_records_items_to_blocks(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_fields_iloc: tp.Iterable[tp.Hashable],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        func_no: bool,
        fill_value: tp.Any,
        fill_value_dtype: np.dtype,
        index_outer: 'IndexBase',
        dtypes: tp.Tuple[tp.Optional[np.dtype]],
        kind: str,
        ) -> tp.List[np.ndarray]:
    '''
    Group ``blocks`` by the group fields and return one immutable array per entry of ``dtypes``, each aligned to ``index_outer``.

    Every group writes its values at the position of its label in ``index_outer``; positions never written receive ``fill_value``.

    Raises:
        RuntimeError: if ``func_no`` is set and a group holds more than one row.
    '''
    # NOTE: this delivers results by label, row for use in a Frame.from_records_items constructor
    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        group_key = group_fields_iloc[0] #type: ignore

    size = len(index_outer)

    # With a known dtype, allocate an array and fill missing positions at the end; without one, start from a list of fill values: each entry is either replaced (and does not affect dtype evaluation) or is needed as it is.
    arrays: tp.List[tp.Union[tp.List[tp.Any], np.ndarray]] = [
            [fill_value] * size if dtype is None else np.empty(size, dtype=dtype)
            for dtype in dtypes
            ]

    # collect all possible ilocs, and remove as observed; if any remain, we have fill targets
    pending: tp.Set[int] = set(range(size))

    # one flat sequence of functions to apply to every data field
    if func_no:
        funcs: tp.Tuple[UFunc, ...] = ()
    elif func_single:
        funcs = (func_single,)
    else:
        funcs = tuple(func for _, func in func_map)

    # each group forms a row, each label a value in the index
    for label, _, part in blocks.group(axis=0, key=group_key, kind=kind):
        iloc: int = index_outer._loc_to_iloc(label) #type: ignore
        pending.remove(iloc)

        if func_no:
            if len(part) != 1:
                raise RuntimeError(
                        'pivot requires aggregation of values; provide a `func` argument.'
                        )
            for position, column_key in enumerate(data_fields_iloc):
                # this is equivalent to extracting a row, but doing so would force a type consolidation
                arrays[position][iloc] = part._extract(0, column_key)
            continue

        position = 0
        for column_key in data_fields_iloc:
            column_values = part._extract_array_column(column_key)
            for func in funcs:
                arrays[position][iloc] = func(column_values)
                position += 1

    # order does not matter; empty when every position was observed
    fill_targets = list(pending)

    for position, array in enumerate(arrays):
        if array.__class__ is not np.ndarray:
            # a list: already holds fill values wherever nothing was written
            array, _ = iterable_to_array_1d(array, count=size)
        elif fill_targets:
            # widen the dtype if the fill value requires it, then fill
            dtype_resolved = resolve_dtype(array.dtype, fill_value_dtype) # type: ignore
            if array.dtype != dtype_resolved: # type: ignore
                array = array.astype(dtype_resolved) #type: ignore
            array[fill_targets] = fill_value
        arrays[position] = array # re-assign possibly new array
        array.flags.writeable = False # type: ignore

    return arrays
def _from_archive(
        cls,
        *,
        constructor: tp.Type['Frame'],
        fp: PathSpecifier,
        memory_map: bool = False,
        ) -> tp.Tuple['Frame', Archive]:
    '''
    Create a :obj:`Frame` from an archive at ``fp`` and return it together with the opened, read-only archive.
    '''
    from static_frame.core.type_blocks import TypeBlocks

    archive = cls._ARCHIVE_CLS(fp, writeable=False, memory_map=memory_map)
    metadata = archive.read_metadata()

    # JSON will bring back tuple `name` attributes as lists; these must be converted to tuples to be hashable. Alternatives (like storing repr and using literal_eval) are slower than JSON.
    name, name_index, name_columns = map(
            list_to_tuple,
            metadata[Label.KEY_NAMES],
            )
    block_count, depth_index, depth_columns = metadata[Label.KEY_DEPTHS]
    cls_index, cls_columns = map(
            ContainerMap.str_to_cls,
            metadata[Label.KEY_TYPES],
            )

    index = ArchiveIndexConverter._index_decode(
            archive=archive,
            metadata=metadata,
            key_template_values=Label.FILE_TEMPLATE_VALUES_INDEX,
            key_types=Label.KEY_TYPES_INDEX,
            depth=depth_index,
            cls_index=cls_index,
            name=name_index,
            )
    columns = ArchiveIndexConverter._index_decode(
            archive=archive,
            metadata=metadata,
            key_template_values=Label.FILE_TEMPLATE_VALUES_COLUMNS,
            key_types=Label.KEY_TYPES_COLUMNS,
            depth=depth_columns,
            cls_index=cls_columns,
            name=name_columns,
            )

    stored_blocks = (
            archive.read_array(Label.FILE_TEMPLATE_BLOCKS.format(position))
            for position in range(block_count)
            )
    tb = TypeBlocks.from_blocks(stored_blocks)

    # a decoded index or columns is handed over as owned; None is not
    f = constructor(
            tb,
            own_data=True,
            index=index,
            own_index=index is not None,
            columns=columns,
            own_columns=columns is not None,
            name=name,
            )
    return f, archive
def pivot_core(
        *,
        frame: 'Frame',
        index_fields: tp.List[tp.Hashable],
        columns_fields: tp.List[tp.Hashable],
        data_fields: tp.List[tp.Hashable],
        func_fields: tp.Tuple[tp.Hashable, ...],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        fill_value: object = np.nan,
        index_constructor: IndexConstructor = None,
        ) -> 'Frame':
    '''Core implementation of Frame.pivot(). The Frame has already been reduced to just relevant columns, and all fields groups are normalized as lists of hashables.

    Args:
        frame: source Frame; its blocks, columns and dtypes are read here.
        index_fields: column labels whose values form the resulting index.
        columns_fields: column labels whose values form the resulting columns; when empty, a single group-by on ``index_fields`` is performed.
        data_fields: column labels holding the values to aggregate.
        func_fields: labels combined with ``data_fields`` to label result columns when ``func_map`` is used.
        func_single: a single aggregation function, or None.
        func_map: pairs of (label, function) applied to every data field.
        fill_value: value used when reindexing each group's result onto the outer index.
        index_constructor: optional constructor for the resulting index.

    Returns:
        A new instance of ``frame.__class__``.
    '''
    from static_frame.core.series import Series
    from static_frame.core.frame import Frame

    data_fields_len = len(data_fields)
    index_depth = len(index_fields)

    # all are lists of hashables; get converted to lists of integers
    columns_loc_to_iloc = frame.columns._loc_to_iloc
    index_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(index_fields) #type: ignore
    data_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(data_fields) #type: ignore
    columns_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(columns_fields) #type: ignore

    # For data fields, we add the field name, not the field values, to the columns.
    columns_name = tuple(columns_fields)
    if data_fields_len > 1 or not columns_fields:
        # if no columns_fields, have to add values label
        # NOTE: chain the list of fields itself; unpacking it would iterate into each label (splitting strings into characters, failing on non-iterable labels)
        columns_name = tuple(chain(columns_fields, ('values', )))
    if len(func_map) > 1:
        # multiple functions add one more level to the column labels
        columns_name = columns_name + ('func', )

    columns_depth = len(columns_name)

    if columns_depth == 1:
        # a depth of one uses the flat columns constructor with a scalar name
        columns_name = columns_name[0] # type: ignore
        columns_constructor = partial(frame._COLUMNS_CONSTRUCTOR, name=columns_name)
    else:
        columns_constructor = partial(
                frame._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
                depth_reference=columns_depth,
                name=columns_name)

    dtype_map = frame.dtypes
    dtypes_per_data_fields = tuple(
            pivot_records_dtypes(
                    dtype_map=dtype_map,
                    data_fields=data_fields,
                    func_single=func_single,
                    func_map=func_map,
                    ))
    # NOTE(review): ``dtype_single`` is only bound with a ``func_single`` and exactly one data field; the single-column paths below assume the caller provides ``func_single`` whenever they are reached -- confirm against the caller.
    if func_single and data_fields_len == 1:
        dtype_single = ufunc_dtype_to_dtype(func_single, dtype_map[data_fields[0]])

    #---------------------------------------------------------------------------
    # first major branch: if we are only grouping by index fields

    if not columns_fields: # group by is only index_fields
        columns = data_fields if func_single else tuple(
                product(data_fields, func_fields))

        # NOTE: at this time we do not automatically give back an IndexHierarchy when index_depth is == 1, as the order of the resultant values may not be hierarchable.
        name_index = index_fields[0] if index_depth == 1 else tuple(
                index_fields)
        # bind the index name onto whichever constructor is used
        if index_constructor:
            index_constructor = partial(index_constructor, name=name_index)
        else:
            index_constructor = partial(Index, name=name_index)

        if len(columns) == 1:
            # one data field and one function: build a Series, then a Frame
            f = frame.from_series(
                    Series.from_items(
                            pivot_items(
                                    blocks=frame._blocks,
                                    group_fields_iloc=index_fields_iloc,
                                    group_depth=index_depth,
                                    data_field_iloc=data_fields_iloc[0],
                                    func_single=func_single,
                                    ),
                            name=columns[0],
                            index_constructor=index_constructor,
                            dtype=dtype_single,
                            ),
                    columns_constructor=columns_constructor)
        else:
            f = frame.from_records_items(
                    pivot_records_items(
                            blocks=frame._blocks,
                            group_fields_iloc=index_fields_iloc,
                            group_depth=index_depth,
                            data_fields_iloc=data_fields_iloc,
                            func_single=func_single,
                            func_map=func_map,
                            ),
                    columns_constructor=columns_constructor,
                    columns=columns,
                    index_constructor=index_constructor,
                    dtypes=dtypes_per_data_fields,
                    )

        # have to rename columns if derived in from_concat
        columns_final = (f.columns.rename(columns_name) if columns_depth == 1
                else columns_constructor(f.columns))
        return f.relabel(columns=columns_final) #type: ignore

    #---------------------------------------------------------------------------
    # second major branch: we are grouping by index and columns fields

    # with a single columns field the group label is a bare value and must be wrapped in a tuple before deriving column labels
    retuple_group_label = len(columns_fields) == 1

    # group by on 1 or more columns fields
    # NOTE: explored doing one group on index and columns that insert into pre-allocated arrays, but that proved slower than this approach
    group_key = columns_fields_iloc if len(
            columns_fields_iloc) > 1 else columns_fields_iloc[0]

    index_outer = pivot_outer_index(
            frame=frame,
            index_fields=index_fields,
            index_depth=index_depth,
            index_constructor=index_constructor,
            )

    # collect subframes based on an index of tuples and columns of tuples (if depth > 1)
    sub_blocks = []
    sub_columns_collected: tp.List[tp.Hashable] = []

    for group, _, sub in frame._blocks.group(axis=0, key=group_key):
        # derive the column fields represented by this group
        sub_columns = extrapolate_column_fields(
                columns_fields,
                group if not retuple_group_label else (group, ),
                data_fields,
                func_fields)
        sub_columns_collected.extend(sub_columns)

        # sub is TypeBlocks unique value in columns_group; this may or may not have unique index fields; if not, it needs to be aggregated
        if index_depth == 1:
            sub_index_labels = sub._extract_array_column(index_fields_iloc[0])
            sub_index_labels_unique = ufunc_unique(sub_index_labels)
        else:
            # match to an index of tuples; the order might not be the same as IH
            # NOTE: might be able to keep arrays and concat below
            sub_index_labels = tuple(
                    zip(*(sub._extract_array_column(columns_loc_to_iloc(f))
                            for f in index_fields)))
            sub_index_labels_unique = set(sub_index_labels)

        sub_frame: tp.Union[Frame, Series]

        # if sub_index_labels are not unique we need to aggregate
        if len(sub_index_labels_unique) != len(sub_index_labels):
            # if sub_columns length is 1, that means that we only need to extract one column out of the sub Frame
            if len(sub_columns) == 1:
                assert len(data_fields) == 1
                # NOTE: grouping on index_fields; can pre-process array_to_groups_and_locations
                sub_frame = Series.from_items(
                        pivot_items(
                                blocks=sub,
                                group_fields_iloc=index_fields_iloc,
                                group_depth=index_depth,
                                data_field_iloc=data_fields_iloc[0],
                                func_single=func_single,
                                ),
                        dtype=dtype_single,
                        )
            else:
                sub_frame = Frame.from_records_items(
                        pivot_records_items(
                                blocks=sub,
                                group_fields_iloc=index_fields_iloc,
                                group_depth=index_depth,
                                data_fields_iloc=data_fields_iloc,
                                func_single=func_single,
                                func_map=func_map),
                        dtypes=dtypes_per_data_fields,
                        )
        else:
            # we have unique values per index item, but may not have a complete index
            if func_single:
                # NOTE: should apply function even with func_single
                if len(data_fields) == 1:
                    sub_frame = Frame(
                            sub._extract_array_column(data_fields_iloc[0]),
                            index=sub_index_labels,
                            index_constructor=index_constructor,
                            own_data=True)
                else:
                    sub_frame = Frame(
                            sub._extract(
                                    row_key=None,
                                    column_key=data_fields_iloc),
                            index=sub_index_labels,
                            index_constructor=index_constructor,
                            own_data=True)
            else:
                # values are unique per label, so each (field, function) result column reuses the field's values without calling the function
                def blocks() -> tp.Iterator[np.ndarray]:
                    for field in data_fields_iloc:
                        for _, func in func_map:
                            yield sub._extract_array_column(field)

                sub_frame = Frame(
                        TypeBlocks.from_blocks(blocks()),
                        index=sub_index_labels,
                        own_data=True,
                        )

        # align this group's result to the full outer index
        sub_frame = sub_frame.reindex(
                index_outer,
                own_index=True,
                fill_value=fill_value,
                )
        if sub_frame.ndim == 1:
            # a Series contributes a single array
            sub_blocks.append(sub_frame.values)
        else:
            sub_blocks.extend(sub_frame._blocks._blocks) # type: ignore

    # blocks were appended in the same order as their column labels
    tb = TypeBlocks.from_blocks(sub_blocks)
    return frame.__class__(
            tb,
            index=index_outer,
            columns=columns_constructor(sub_columns_collected),
            own_data=True,
            own_index=True,
            own_columns=True,
            )