def to_bus(self) -> 'Bus':
    '''Realize this :obj:`Batch` as a :obj:`Bus`.

    The pairs produced by ``self.items()`` are collected into an object-dtype :obj:`Series` named with ``self._name``; that :obj:`Series` is then wrapped in a :obj:`Bus`. Note that, as a :obj:`Bus` must have all labels (even if :obj:`Frame` are loaded lazily), all items are consumed here.

    Returns:
        :obj:`Bus`
    '''
    series = Series.from_items(
            self.items(),
            name=self._name,
            dtype=DTYPE_OBJECT,
            )
    return Bus(series)
def from_buses(
        cls,
        buses: tp.Iterable[Bus],
        *,
        name: NameType = None,
        retain_labels: bool,
        deepcopy_from_bus: bool = False,
        ) -> 'Yarn':
    '''Build a :obj:`Yarn` from an iterable of :obj:`Bus`, labelling each :obj:`Bus` by its ``name`` attribute.

    Args:
        buses: iterable of :obj:`Bus`; each is paired with its ``name``.
        name: name given to the intermediate object-dtype :obj:`Series`.
        retain_labels: if True, the hierarchy built from the buses is used directly as the index; otherwise the result of ``hierarchy.level_drop(1)`` is used.
        deepcopy_from_bus: forwarded to ``buses_to_hierarchy`` and to the constructor.

    Returns:
        :obj:`Yarn`
    '''
    labelled = ((bus.name, bus) for bus in buses)
    series = Series.from_items(labelled, dtype=DTYPE_OBJECT, name=name)

    hierarchy = buses_to_hierarchy(
            series.values,
            series.index,
            deepcopy_from_bus=deepcopy_from_bus,
            init_exception_cls=ErrorInitYarn,
            )

    index = hierarchy if retain_labels else hierarchy.level_drop(1) #type: ignore

    return cls(
            series,
            hierarchy=hierarchy,
            index=index,
            deepcopy_from_bus=deepcopy_from_bus,
            )
def test_bus_max_persist_a(self) -> None:
    '''With ``max_persist=3``, reading all 20 labels in order never leaves more than three Frames loaded, and only the final three remain loaded afterwards.
    '''
    frame_count = 20
    pairs = (
            (str(i), Frame(np.arange(i, i+10).reshape(2, 5)))
            for i in range(frame_count)
            )
    bus_src = Bus(Series.from_items(pairs, dtype=object))

    config = StoreConfig(
            index_depth=1,
            columns_depth=1,
            include_columns=True,
            include_index=True
            )

    with temp_file('.zip') as fp:
        bus_src.to_zip_pickle(fp)
        bus_lazy = Bus.from_zip_pickle(fp, config=config, max_persist=3)

        for label in bus_lazy.index:
            _ = bus_lazy[label]
            # the loaded count is checked after every single access
            self.assertTrue(bus_lazy._loaded.sum() <= 3)

        # after iteration only the last three are loaded
        expected = [False] * (frame_count - 3) + [True] * 3
        self.assertEqual(bus_lazy._loaded.tolist(), expected)
def test_bus_max_persist_3(self) -> None:
    '''With ``max_persist`` equal to the number of Frames, all Frames stay loaded and ``_last_accessed`` key order follows the most recent selections.
    '''
    pairs = (
            (str(i), Frame(np.arange(i, i+10).reshape(2, 5)))
            for i in range(4)
            )
    bus_src = Bus(Series.from_items(pairs, dtype=object))

    config = StoreConfig(
            index_depth=1,
            columns_depth=1,
            include_columns=True,
            include_index=True
            )

    with temp_file('.zip') as fp:
        bus_src.to_zip_pickle(fp)
        bus_lazy = Bus.from_zip_pickle(fp, config=config, max_persist=4)

        def access_order() -> tp.List[tp.Hashable]:
            # current key order of the last-accessed mapping
            return list(bus_lazy._last_accessed.keys())

        # two selections that together touch every Frame
        _ = bus_lazy.iloc[[0, 1]]
        _ = bus_lazy.iloc[[2, 3]]
        self.assertTrue(bus_lazy._loaded_all)

        _ = bus_lazy.iloc[[1, 0]]
        self.assertEqual(access_order(), ['2', '3', '1', '0'])

        _ = bus_lazy.iloc[3]
        self.assertEqual(access_order(), ['2', '1', '0', '3'])

        _ = bus_lazy.iloc[:3]
        self.assertEqual(access_order(), ['3', '0', '1', '2'])
def to_series() -> Series:
    '''Return a :obj:`Series`, named ``'platform'``, describing the running environment.

    The leading entries are the platform string, the interpreter version (newlines removed) and the static-frame version. One entry follows per optional package: its ``__version__`` if present, else its ``version`` attribute, else None. If the package cannot be imported, the ``ModuleNotFoundError`` class itself is stored as the value.
    '''
    # NOTE: see requirements-extras.txt
    optional_packages = (
            'numpy',
            'pandas',
            'xlsxwriter',
            'openpyxl',
            'xarray',
            'tables',
            'pyarrow',
            'msgpack',
            'msgpack_numpy',
            )
    # attribute names probed in priority order; ``version`` is used by msgpack
    version_attrs = ('__version__', 'version')

    def items() -> tp.Iterator[tp.Tuple[str, tp.Any]]:
        yield 'platform', platform_mod.platform()
        yield 'sys.version', sys.version.replace('\n', '')
        yield 'static-frame', static_frame.__version__

        for package in optional_packages:
            try:
                mod = importlib.import_module(package)
            except ModuleNotFoundError: #pragma: no cover
                yield package, ModuleNotFoundError #pragma: no cover
                continue #pragma: no cover

            for attr in version_attrs:
                if hasattr(mod, attr):
                    yield package, getattr(mod, attr) #type: ignore
                    break
            else:
                # neither attribute is available
                yield package, None

    return Series.from_items(items(), name='platform')
def test_bus_max_persist_b(self) -> None:
    '''With ``max_persist=1``, a slice of a lazily loaded Bus keeps exactly one Frame loaded, and each single access moves the loaded position.
    '''
    pairs = (
            (str(i), Frame(np.arange(i, i+10).reshape(2, 5)))
            for i in range(20)
            )
    bus_src = Bus(Series.from_items(pairs, dtype=object))

    config = StoreConfig(
            index_depth=1,
            columns_depth=1,
            include_columns=True,
            include_index=True
            )

    def only_loaded_at(position: int) -> tp.List[bool]:
        # expected mask over the ten-element slice: True at one position only
        return [i == position for i in range(10)]

    with temp_file('.zip') as fp:
        bus_src.to_zip_pickle(fp)
        bus_lazy = Bus.from_zip_pickle(fp, config=config, max_persist=1)

        bus_tail = bus_lazy.iloc[10:]
        self.assertEqual(bus_tail._loaded.sum(), 1)
        # only the last one is loaded
        self.assertEqual(bus_tail._loaded.tolist(), only_loaded_at(9))

        self.assertEqual(bus_tail.iloc[0].sum().sum(), 145)
        self.assertEqual(bus_tail._loaded.tolist(), only_loaded_at(0))

        self.assertEqual(bus_tail.iloc[4].sum().sum(), 185)
        self.assertEqual(bus_tail._loaded.tolist(), only_loaded_at(4))
def to_bus(self) -> 'Bus':
    '''Realize this :obj:`Batch` as a :obj:`Bus`, passing along ``self._config``.

    Because a :obj:`Bus` must have all labels (even if :obj:`Frame` are loaded lazily), this :obj:`Batch` will be exhausted.

    Returns:
        :obj:`Bus`
    '''
    return Bus(
            Series.from_items(
                    self.items(),
                    name=self._name,
                    dtype=DTYPE_OBJECT,
                    ),
            config=self._config,
            )
def shapes(self) -> Series:
    '''A :obj:`Series`, named ``'shape'``, holding the ``shape`` of each iterated :obj:`Frame`, keyed by label.

    Iterates ``self._items``.

    Returns:
        :obj:`Series` of object dtype
    '''
    return Series.from_items(
            ((name, frame.shape) for name, frame in self._items),
            name='shape',
            dtype=DTYPE_OBJECT,
            )
def display(self, config: tp.Optional[DisplayConfig] = None) -> Display:
    '''Return a :obj:`Display` listing, for each label in ``self._items``, the class of the associated value.

    Args:
        config: display configuration; when falsy, ``DisplayActive.get()`` is used instead.
    '''
    if not config:
        config = DisplayActive.get()

    label_to_cls = ((label, f.__class__) for label, f in self._items)
    series = Series.from_items(label_to_cls, name=self._name)

    # an empty Display carrying only the header for this container
    header = DisplayHeader(self.__class__, self._name)
    display_cls = Display.from_values((), header=header, config=config)
    return series._display(config, display_cls)
def from_frames(
        cls,
        frames: tp.Iterable[Frame],
        *,
        config: StoreConfigMapInitializer = None,
        ) -> 'Bus':
    '''Build a ``Bus`` from an iterable of ``Frame``, using each :obj:`Frame.name` as its label.

    Args:
        frames: iterable of ``Frame``.
        config: forwarded unchanged to the constructor.
    '''
    # could take a StoreConfigMap
    named_frames = ((frame.name, frame) for frame in frames)
    return cls(Series.from_items(named_frames, dtype=object), config=config)
def test_bus_max_persist_d(self) -> None:
    '''With ``max_persist=3`` over five Frames, check the ``_loaded`` mask after each of a sequence of ``iloc`` selections.
    '''
    pairs = (
            (str(i), Frame(np.arange(i, i+10).reshape(2, 5)))
            for i in range(5)
            )
    bus_src = Bus(Series.from_items(pairs, dtype=object))

    config = StoreConfig(
            index_depth=1,
            columns_depth=1,
            include_columns=True,
            include_index=True
            )

    # each step: the iloc key to select, then the expected loaded mask
    steps: tp.Tuple[tp.Tuple[tp.Any, tp.List[bool]], ...] = (
            ([0, 2, 4], [True, False, True, False, True]),
            ([1, 2, 3], [False, True, True, True, False]),
            (4, [False, False, True, True, True]),
            (0, [True, False, False, True, True]),
            ([2, 3, 4], [False, False, True, True, True]),
            ([0, 1], [True, True, False, False, True]),
            (0, [True, True, False, False, True]),
            ([3, 4], [True, False, False, True, True]),
            )

    with temp_file('.zip') as fp:
        bus_src.to_zip_pickle(fp)
        bus_lazy = Bus.from_zip_pickle(fp, config=config, max_persist=3)

        for key, expected in steps:
            _ = bus_lazy.iloc[key]
            self.assertEqual(bus_lazy._loaded.tolist(), expected)
def mloc(self) -> Series:
    '''Return a :obj:`Series` with one entry per label: None where the stored value is ``FrameDeferred``, otherwise a tuple built from that Frame's ``mloc``.

    If nothing is loaded, a :obj:`Series` of None over the same index is returned.
    '''
    index = self._series._index
    if not self._loaded.any():
        return Series.from_element(None, index=index)

    pairs = (
            (label, None if f is FrameDeferred else tuple(f.mloc))
            for label, f in zip(index, self._series.values)
            )
    return Series.from_items(pairs)
def mloc(self) -> Series:
    '''Return a :obj:`Series` with one entry per label: None where the stored value is ``FrameDeferred``, otherwise a tuple built from that Frame's ``mloc``.

    If nothing is loaded, a :obj:`Series` of None over ``self._index`` is returned.
    '''
    if not self._loaded.any():
        return Series.from_element(None, index=self._index)

    pairs = (
            (label, None if f is FrameDeferred else tuple(f.mloc))
            for label, f in zip(self._index, self._values_mutable)
            )
    return Series.from_items(pairs)
def from_items(cls,
        pairs: tp.Iterable[tp.Tuple[tp.Hashable, Frame]],
        *,
        config: StoreConfigMapInitializer = None,
        name: NameType = None,
        index_constructor: tp.Optional[tp.Callable[..., IndexBase]] = None
        ) -> 'Bus':
    '''Build a :obj:`Bus` from an iterable of (label, :obj:`Frame`) pairs.

    Args:
        pairs: iterable of label, :obj:`Frame` pairs.
        config: forwarded to the constructor.
        name: name given to the underlying object-dtype :obj:`Series`.
        index_constructor: forwarded to ``Series.from_items``.

    Returns:
        :obj:`Bus`
    '''
    series_kwargs = dict(
            dtype=DTYPE_OBJECT,
            name=name,
            index_constructor=index_constructor,
            )
    series = Series.from_items(pairs, **series_kwargs)
    # the Series was created here, so the constructor is told it owns the data
    return cls(series, config=config, own_data=True)
def from_frames(cls,
        frames: tp.Iterable[Frame],
        *,
        index_constructor: IndexConstructor = None,
        config: StoreConfigMapInitializer = None,
        name: NameType = None,
        ) -> 'Bus':
    '''Build a :obj:`Bus` from an iterable of :obj:`Frame`, using each :obj:`Frame.name` as its label.

    Args:
        frames: iterable of :obj:`Frame`.
        index_constructor: forwarded to ``Series.from_items``.
        config: forwarded to the constructor.
        name: name given to the underlying object-dtype :obj:`Series`.

    Raises:
        ErrorInitIndexNonUnique: re-raised with a Frame-specific message when building the :obj:`Series` raises this error.
    '''
    named_frames = ((frame.name, frame) for frame in frames)
    try:
        series = Series.from_items(
                named_frames,
                dtype=DTYPE_OBJECT,
                name=name,
                index_constructor=index_constructor,
                )
    except ErrorInitIndexNonUnique:
        # replace the generic message; ``from None`` suppresses the chained context
        raise ErrorInitIndexNonUnique("Frames do not have unique names.") from None
    else:
        return cls(series, config=config, own_data=True)
def display(
        self,
        config: tp.Optional[DisplayConfig] = None,
        *,
        style_config: tp.Optional[StyleConfig] = None,
        ) -> Display:
    '''Provide a :obj:`Series`-style display of the :obj:`Batch`, showing the class of each item by label. Note that if the held iterator is a generator, this display will exhaust the generator.

    Args:
        config: display configuration; when falsy, ``DisplayActive.get()`` is used instead.
        style_config: forwarded to the underlying :obj:`Series` display.
    '''
    if not config:
        config = DisplayActive.get()

    label_to_cls = ((label, f.__class__) for label, f in self._items)
    series = Series.from_items(label_to_cls, name=self._name)

    # an empty Display carrying only the header for this container
    header = DisplayHeader(self.__class__, self._name)
    display_cls = Display.from_values((), header=header, config=config)

    return series._display(
            config,
            display_cls=display_cls,
            style_config=style_config,
            )
def to_series_items_group(
        self,
        pairs: tp.Iterable[tp.Tuple[tp.Hashable, tp.Any]],
        *,
        dtype: DtypeSpecifier = None,
        name: NameType = None,
        index_constructor: tp.Optional[IndexConstructor] = None,
        name_index: NameType = None,
        ) -> 'Series':
    '''Build a :obj:`Series` from label, value pairs, with the resulting index named ``name_index``.

    Args:
        pairs: iterable of label, value pairs.
        dtype: forwarded to ``Series.from_items``.
        name: name of the resulting :obj:`Series`.
        index_constructor: callable used to build the index; ``Index`` is used when None.
        name_index: bound as the ``name`` keyword of the index constructor.
    '''
    from static_frame.core.series import Series
    from static_frame.core.index import Index

    if index_constructor is None:
        constructor_base = Index
    else:
        constructor_base = index_constructor

    # bind the index name so it is applied when the index is created
    named_constructor = partial(constructor_base, name=name_index)

    return Series.from_items(
            pairs=pairs,
            dtype=dtype,
            name=name,
            index_constructor=named_constructor,
            )
def to_series_items(
        self,
        pairs: tp.Iterable[tp.Tuple[tp.Hashable, tp.Any]],
        *,
        dtype: DtypeSpecifier = None,
        name: NameType = None,
        index_constructor: tp.Optional[IndexConstructor] = None,
        axis: int = 0,
        ) -> 'Series':
    '''Build a :obj:`Series` from label, value pairs; the resulting index takes its name, and its default constructor, from the container's index or columns.

    Args:
        pairs: iterable of label, value pairs.
        dtype: forwarded to ``Series.from_items``.
        name: name of the resulting :obj:`Series`.
        index_constructor: callable used to build the index; when None, ``from_labels`` of the selected container index is used.
        axis: for a two-dimensional container, 1 selects the columns as the label source; otherwise the index is used.
    '''
    from static_frame.core.series import Series

    # apply_constructor should be implemented to take a pairs of label, value; only used for iter_window
    # axis 0 iters windows labelled by the index, axis 1 iters windows labelled by the columns
    container = self._container
    if container._NDIM == 2 and axis == 1:
        labels_source = container._columns #type: ignore
    else:
        labels_source = container._index

    if index_constructor is None:
        index_constructor = labels_source.from_labels

    index_constructor_final = partial(
            index_constructor,
            name=labels_source._name,
            )

    # always return a Series
    return Series.from_items(
            pairs=pairs,
            dtype=dtype,
            name=name,
            index_constructor=index_constructor_final,
            )
def from_frames(cls, frames: tp.Iterable[Frame]) -> 'Bus':
    '''Build a ``Bus`` from an iterable of ``Frame``, using each :obj:`Frame.name` as its label.
    '''
    named_frames = ((frame.name, frame) for frame in frames)
    return cls(Series.from_items(named_frames, dtype=object))
def pivot_items_to_block(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_field_iloc: tp.Hashable,
        func_single: tp.Optional[UFunc],
        dtype: tp.Optional[np.dtype],
        fill_value: tp.Any,
        fill_value_dtype: np.dtype,
        index_outer: 'IndexBase',
        kind: str,
        ) -> np.ndarray:
    '''
    Specialized routine for when we have only one data_field and at most one function: build a single 1D array, aligned to ``index_outer``, holding the (optionally aggregated) values of that data field.

    Args:
        blocks: source of the values; grouped on the group fields, or read directly when no function is given.
        group_fields_iloc: integer positions of the fields to group by; when ``group_depth`` is 1 only the first is used.
        group_depth: number of group fields.
        data_field_iloc: integer position of the single data field.
        func_single: function applied to each group's values, or None to copy values without aggregation.
        dtype: dtype of the result if already known; None means it is derived from the produced values.
        fill_value: value for positions of ``index_outer`` that receive no data.
        fill_value_dtype: dtype of ``fill_value``, resolved against the data dtype whenever filling is needed.
        index_outer: index used to map each group label to its position in the returned array.
        kind: forwarded to ``blocks.group_extract``.

    Returns:
        An array of length ``len(index_outer)`` with its writeable flag set to False.
    '''
    from static_frame.core.series import Series

    group_key = group_fields_iloc if group_depth > 1 else group_fields_iloc[0] #type: ignore
    size = len(index_outer)

    if func_single and dtype is not None:
        # the dtype is known up front: pre-fill, then assign one aggregate per group
        array = np.full(
                size,
                fill_value,
                dtype=resolve_dtype(dtype, fill_value_dtype),
                )
        for label, _, values in blocks.group_extract(
                axis=0,
                key=group_key,
                extract=data_field_iloc,
                kind=kind,
                ):
            array[index_outer._loc_to_iloc(label)] = func_single(values)
        array.flags.writeable = False
        return array

    if func_single and dtype is None:
        # the dtype is unknown: collect (position, aggregate) pairs in a Series so the dtype can be discovered
        def gen() -> tp.Iterator[tp.Tuple[int, tp.Any]]:
            for label, _, values in blocks.group_extract(
                    axis=0,
                    key=group_key,
                    extract=data_field_iloc,
                    kind=kind,
                    ):
                yield index_outer._loc_to_iloc(label), func_single(values)

        post = Series.from_items(gen())
        if len(post) == size:
            # every position will be assigned, so no fill is needed
            array = np.empty(size, dtype=post.dtype)
        else:
            array = np.full(
                    size,
                    fill_value,
                    dtype=resolve_dtype(post.dtype, fill_value_dtype),
                    )
        array[post.index.values] = post.values
        array.flags.writeable = False
        return array

    # func_no scenario as no mapping here
    if group_depth == 1:
        labels = [
                index_outer._loc_to_iloc(label)
                for label in blocks._extract_array_column(group_key)
                ]
    else:
        # NOTE: might replace _extract_array_column with an iterator of tuples
        labels = [
                index_outer._loc_to_iloc(tuple(label))
                for label in blocks._extract_array(column_key=group_key)
                ]

    values = blocks._extract_array_column(data_field_iloc)
    if len(values) == size:
        # With dtype=None, np.empty would allocate float64 and coerce (or reject) the values assigned below, so fall back to the dtype of the values.
        array = np.empty(size, dtype=values.dtype if dtype is None else dtype)
    else:
        array = np.full(
                size,
                fill_value,
                dtype=resolve_dtype(values.dtype, fill_value_dtype),
                )
    array[labels] = values
    array.flags.writeable = False
    return array
def pivot_core(
        *,
        frame: 'Frame',
        index_fields: tp.List[tp.Hashable],
        columns_fields: tp.List[tp.Hashable],
        data_fields: tp.List[tp.Hashable],
        func_fields: tp.Tuple[tp.Hashable, ...],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        fill_value: object = np.nan,
        index_constructor: IndexConstructor = None,
        ) -> 'Frame':
    '''Core implementation of Frame.pivot(). The Frame has already been reduced to just relevant columns, and all fields groups are normalized as lists of hashables.

    Args:
        frame: the source Frame.
        index_fields: column labels whose values form the resulting index.
        columns_fields: column labels whose values form the resulting columns; may be empty.
        data_fields: column labels providing the values.
        func_fields: labels of the functions, used when building column labels.
        func_single: the single function to apply, or None.
        func_map: pairs of label, function.
        fill_value: value used when reindexing per-group results onto the full outer index.
        index_constructor: optional constructor for the resulting index.

    Returns:
        A new Frame; in the columns-grouping branch it is an instance of ``frame.__class__``.
    '''
    from static_frame.core.series import Series
    from static_frame.core.frame import Frame

    data_fields_len = len(data_fields)
    index_depth = len(index_fields)

    # all are lists of hashables; get converted to lists of integers
    columns_loc_to_iloc = frame.columns._loc_to_iloc
    index_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(index_fields) #type: ignore
    data_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(data_fields) #type: ignore
    columns_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(columns_fields) #type: ignore

    # For data fields, we add the field name, not the field values, to the columns.
    columns_name = tuple(columns_fields)
    if data_fields_len > 1 or not columns_fields:
        # if no columns_fields, have to add values label
        # NOTE: append to the tuple of field labels; unpacking each label (as with chain(*columns_fields, ...)) would split multi-character string labels into characters and fail on non-iterable labels
        columns_name = columns_name + ('values', )
    if len(func_map) > 1:
        columns_name = columns_name + ('func', )

    columns_depth = len(columns_name)
    if columns_depth == 1:
        columns_name = columns_name[0] # type: ignore
        columns_constructor = partial(frame._COLUMNS_CONSTRUCTOR, name=columns_name)
    else:
        columns_constructor = partial(
                frame._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
                depth_reference=columns_depth,
                name=columns_name)

    dtype_map = frame.dtypes
    dtypes_per_data_fields = tuple(
            pivot_records_dtypes(
                    dtype_map=dtype_map,
                    data_fields=data_fields,
                    func_single=func_single,
                    func_map=func_map,
                    ))

    # Always bind dtype_single: it is read below whenever a single column is produced, and None lets the Series constructor determine the dtype.
    dtype_single = None
    if func_single and data_fields_len == 1:
        dtype_single = ufunc_dtype_to_dtype(func_single, dtype_map[data_fields[0]])

    #---------------------------------------------------------------------------
    # first major branch: if we are only grouping by index fields

    if not columns_fields: # group by is only index_fields
        columns = data_fields if func_single else tuple(
                product(data_fields, func_fields))

        # NOTE: at this time we do not automatically give back an IndexHierarchy when index_depth is == 1, as the order of the resultant values may not be hierarchable.
        name_index = index_fields[0] if index_depth == 1 else tuple(index_fields)
        if index_constructor:
            index_constructor = partial(index_constructor, name=name_index)
        else:
            index_constructor = partial(Index, name=name_index)

        if len(columns) == 1:
            # assert len(data_fields) == 1
            f = frame.from_series(
                    Series.from_items(
                            pivot_items(
                                    blocks=frame._blocks,
                                    group_fields_iloc=index_fields_iloc,
                                    group_depth=index_depth,
                                    data_field_iloc=data_fields_iloc[0],
                                    func_single=func_single,
                                    ),
                            name=columns[0],
                            index_constructor=index_constructor,
                            dtype=dtype_single,
                            ),
                    columns_constructor=columns_constructor)
        else:
            f = frame.from_records_items(
                    pivot_records_items(
                            blocks=frame._blocks,
                            group_fields_iloc=index_fields_iloc,
                            group_depth=index_depth,
                            data_fields_iloc=data_fields_iloc,
                            func_single=func_single,
                            func_map=func_map,
                            ),
                    columns_constructor=columns_constructor,
                    columns=columns,
                    index_constructor=index_constructor,
                    dtypes=dtypes_per_data_fields,
                    )

        # have to rename columns if derived in from_concat
        columns_final = (f.columns.rename(columns_name) if columns_depth == 1
                else columns_constructor(f.columns))
        return f.relabel(columns=columns_final) #type: ignore

    #---------------------------------------------------------------------------
    # second major branch: we are grouping by index and columns fields

    # avoid doing a multi-column-style selection if not needed
    if len(columns_fields) == 1:
        # columns_group = columns_fields[0]
        retuple_group_label = True
    else:
        # columns_group = columns_fields
        retuple_group_label = False

    # group by on 1 or more columns fields
    # NOTE: explored doing one group on index and columns that insert into pre-allocated arrays, but that proved slower than this approach
    group_key = columns_fields_iloc if len(
            columns_fields_iloc) > 1 else columns_fields_iloc[0]

    index_outer = pivot_outer_index(
            frame=frame,
            index_fields=index_fields,
            index_depth=index_depth,
            index_constructor=index_constructor,
            )

    # collect subframes based on an index of tuples and columns of tuples (if depth > 1)
    sub_blocks = []
    sub_columns_collected: tp.List[tp.Hashable] = []

    # for group, sub in frame.iter_group_items(columns_group):
    for group, _, sub in frame._blocks.group(axis=0, key=group_key):
        # derive the column fields represented by this group
        sub_columns = extrapolate_column_fields(
                columns_fields,
                group if not retuple_group_label else (group, ),
                data_fields,
                func_fields)
        sub_columns_collected.extend(sub_columns)

        # sub is TypeBlocks unique value in columns_group; this may or may not have unique index fields; if not, it needs to be aggregated
        if index_depth == 1:
            sub_index_labels = sub._extract_array_column(index_fields_iloc[0])
            sub_index_labels_unique = ufunc_unique(sub_index_labels)
        else: # match to an index of tuples; the order might not be the same as IH
            # NOTE: might be able to keep arrays and concat below
            sub_index_labels = tuple(
                    zip(*(sub._extract_array_column(columns_loc_to_iloc(f))
                            for f in index_fields)))
            sub_index_labels_unique = set(sub_index_labels)

        sub_frame: tp.Union[Frame, Series]

        # if sub_index_labels are not unique we need to aggregate
        if len(sub_index_labels_unique) != len(sub_index_labels):
            # if sub_columns length is 1, that means that we only need to extract one column out of the sub Frame
            if len(sub_columns) == 1:
                assert len(data_fields) == 1
                # NOTE: grouping on index_fields; can pre-process array_to_groups_and_locations
                sub_frame = Series.from_items(
                        pivot_items(
                                blocks=sub,
                                group_fields_iloc=index_fields_iloc,
                                group_depth=index_depth,
                                data_field_iloc=data_fields_iloc[0],
                                func_single=func_single,
                                ),
                        dtype=dtype_single,
                        )
            else:
                sub_frame = Frame.from_records_items(
                        pivot_records_items(
                                blocks=sub,
                                group_fields_iloc=index_fields_iloc,
                                group_depth=index_depth,
                                data_fields_iloc=data_fields_iloc,
                                func_single=func_single,
                                func_map=func_map),
                        dtypes=dtypes_per_data_fields,
                        )
        else:
            # we have unique values per index item, but may not have a complete index
            if func_single:
                # NOTE: should apply function even with func_single
                if len(data_fields) == 1:
                    sub_frame = Frame(
                            sub._extract_array_column(data_fields_iloc[0]),
                            index=sub_index_labels,
                            index_constructor=index_constructor,
                            own_data=True)
                else:
                    sub_frame = Frame(
                            sub._extract(row_key=None, column_key=data_fields_iloc),
                            index=sub_index_labels,
                            index_constructor=index_constructor,
                            own_data=True)
            else:
                # one column per (data field, function) pair; the function itself is not applied here, the column values are used as they are
                def blocks() -> tp.Iterator[np.ndarray]:
                    for field in data_fields_iloc:
                        for _, func in func_map:
                            yield sub._extract_array_column(field)

                sub_frame = Frame(
                        TypeBlocks.from_blocks(blocks()),
                        index=sub_index_labels,
                        own_data=True,
                        )

        # align this group's result to the full outer index, filling the gaps
        sub_frame = sub_frame.reindex(
                index_outer,
                own_index=True,
                fill_value=fill_value,
                )
        if sub_frame.ndim == 1:
            sub_blocks.append(sub_frame.values)
        else:
            sub_blocks.extend(sub_frame._blocks._blocks) # type: ignore

    tb = TypeBlocks.from_blocks(sub_blocks)
    return frame.__class__(
            tb,
            index=index_outer,
            columns=columns_constructor(sub_columns_collected),
            own_data=True,
            own_index=True,
            own_columns=True,
            )