def from_pandas(cls,
        value: 'pandas.Index',
        ) -> 'IndexBase':
    '''
    Given a Pandas index, return the appropriate IndexBase derived class.

    Args:
        value: a Pandas index. A ``pandas.MultiIndex`` produces a hierarchical index, a ``pandas.DatetimeIndex`` produces an instance of ``cls`` (which must derive from ``IndexDatetime``), and anything else produces an ``Index`` or ``IndexGO``.

    Returns:
        A grow-only index type if ``cls.STATIC`` is false, otherwise the static equivalent.

    Raises:
        ErrorInitIndex: if ``value`` is a ``pandas.DatetimeIndex`` and ``cls`` is not a subclass of ``IndexDatetime``.
    '''
    import pandas
    from static_frame.core.index_datetime import IndexDatetime
    from static_frame import Index
    from static_frame import IndexGO
    from static_frame import IndexHierarchy
    from static_frame import IndexHierarchyGO

    if isinstance(value, pandas.MultiIndex):
        # iterating over a hierarchical index will iterate over labels
        name = tuple(value.names)
        # when no level names were assigned, Pandas reports None for every component; collapse that to a single None so the resulting index is simply unnamed
        if all(n is None for n in name):
            name = None
        if not cls.STATIC:
            return IndexHierarchyGO.from_labels(value, name=name)
        return IndexHierarchy.from_labels(value, name=name)

    if isinstance(value, pandas.DatetimeIndex):
        # a datetime index can only be built through a datetime-aware class
        if not issubclass(cls, IndexDatetime):
            raise ErrorInitIndex(f'cannot create a datetime Index from {cls}')
        # cls already encodes whether the result is static or grow-only
        return cls(value, name=value.name)

    if not cls.STATIC:
        return IndexGO(value, name=value.name)
    return Index(value, name=value.name)
def _extract_labels(
        mapping: tp.Optional[tp.Dict[tp.Hashable, int]],
        labels: tp.Iterable[tp.Hashable],
        dtype: tp.Optional[np.dtype] = None
        ) -> np.ndarray:
    '''Produce the array of labels that is cached alongside the mapping.

    An ndarray passed as ``labels`` is returned after immutable filtering; any other sized iterable is converted to a 1D array; an unsized iterable is ignored and the keys of ``mapping`` are converted instead. This method is overridden in the derived class.

    Args:
        mapping: Can be None if loc_is_iloc.
        labels: might be an expired Generator, but if it is an immutable ndarray, we can use it without a copy.
        dtype: if not None, an ndarray ``labels`` must already have this dtype; for other inputs it is forwarded to the array conversion.

    Raises:
        ErrorInitIndex: if ``labels`` is an ndarray whose dtype differs from a non-None ``dtype``.
    '''
    # arrays are validated and reused rather than rebuilt
    if isinstance(labels, np.ndarray):
        if dtype is not None and dtype != labels.dtype:
            raise ErrorInitIndex('invalid label dtype for this Index')
        return immutable_filter(labels)

    # anything without a length may be an expired generator, so the mapping keys are the only reliable source
    source = labels if hasattr(labels, '__len__') else mapping

    # resolving the dtype is expensive; skip the conversion entirely when there is nothing to convert
    if len(source) == 0: #type: ignore
        return EMPTY_ARRAY

    array, _ = iterable_to_array_1d(source, dtype=dtype) #type: ignore
    return array
def from_pandas(cls,
        value: 'pandas.Index',
        ) -> 'IndexBase':
    '''
    Convert a Pandas index into the matching IndexBase derived class.

    Args:
        value: a ``pandas.Index`` (including ``pandas.MultiIndex`` and ``pandas.DatetimeIndex``).

    Returns:
        A hierarchical index for a ``MultiIndex``; for a ``DatetimeIndex``, an instance of ``cls`` when ``cls`` derives from ``IndexDatetime`` and a nanosecond index otherwise; a plain index in every other case. Grow-only types are used when ``cls.STATIC`` is false.

    Raises:
        ErrorInitIndex: if ``value`` is not a ``pandas.Index``.
    '''
    import pandas

    if not isinstance(value, pandas.Index):
        raise ErrorInitIndex(
                f'from_pandas must be called with a Pandas Index object, not: {type(value)}'
                )

    from static_frame import (
            Index,
            IndexGO,
            IndexHierarchy,
            IndexHierarchyGO,
            IndexNanosecond,
            IndexNanosecondGO,
            )
    from static_frame.core.index_datetime import IndexDatetime

    if isinstance(value, pandas.MultiIndex):
        # iterating over a hierarchical index yields one label tuple per row
        level_names: tp.Optional[tp.Tuple[tp.Hashable, ...]] = tuple(value.names)
        # Pandas reports None for every component when names were never assigned; a tuple of None would cause problems if this index is later unset
        if all(n is None for n in level_names): #type: ignore
            level_names = None

        hierarchy_cls = IndexHierarchy if cls.STATIC else IndexHierarchyGO
        return hierarchy_cls.from_labels(
                value,
                name=level_names,
                depth_reference=value.nlevels,
                )

    if isinstance(value, pandas.DatetimeIndex):
        # a datetime-aware cls is used directly; anything else falls back to nanosecond resolution
        if issubclass(cls, IndexDatetime):
            return cls(value, name=value.name)
        nanosecond_cls = IndexNanosecond if cls.STATIC else IndexNanosecondGO
        return nanosecond_cls(value, name=value.name)

    index_cls = Index if cls.STATIC else IndexGO
    return index_cls(value, name=value.name)
def from_pandas(cls,
        value: 'pandas.Index',
        ) -> 'IndexBase':
    '''
    Given a Pandas index, return the appropriate IndexBase derived class.

    Args:
        value: a ``pandas.Index`` (including ``pandas.MultiIndex`` and ``pandas.DatetimeIndex``).

    Returns:
        A hierarchical index built from the levels and codes of a ``MultiIndex``; for a ``DatetimeIndex``, an instance of ``cls`` when ``cls`` derives from ``IndexDatetime`` and a nanosecond index otherwise; a plain index in every other case. Grow-only types are used when ``cls.STATIC`` is false.

    Raises:
        ErrorInitIndex: if ``value`` is not a ``pandas.Index``, or is a ``MultiIndex`` containing duplicates.
    '''
    import pandas

    if not isinstance(value, pandas.Index):
        raise ErrorInitIndex(f'from_pandas must be called with a Pandas Index object, not: {type(value)}')

    from static_frame import Index
    from static_frame import IndexGO
    from static_frame import IndexHierarchy
    from static_frame import IndexHierarchyGO
    from static_frame import IndexNanosecond
    from static_frame import IndexNanosecondGO
    from static_frame.core.index_datetime import IndexDatetime

    if isinstance(value, pandas.MultiIndex):
        if value.has_duplicates:
            raise ErrorInitIndex(f'cannot create IndexHierarchy from a MultiIndex with duplicates: {value}')

        # iterating over a hierarchical index will iterate over labels
        name: tp.Optional[tp.Tuple[tp.Hashable, ...]] = tuple(value.names)

        # if not assigned Pandas returns None for all components, which will raise issue if trying to unset this index.
        if all(n is None for n in name): #type: ignore
            name = None

        hierarchy_constructor = IndexHierarchy if cls.STATIC else IndexHierarchyGO

        def build_index(pd_idx: 'pandas.Index') -> Index:
            '''Build the index for a single level, matching the static / grow-only nature of ``cls``.'''
            # NOTE: Newer versions of pandas will not require Python date objects to live inside
            # a DatetimeIndex. Instead, it will be a regular Index with dtype=object.
            # Only numpy datetime objects are put into a DatetimeIndex.
            if isinstance(pd_idx, pandas.DatetimeIndex):
                constructor: tp.Type[Index] = IndexNanosecond
            else:
                constructor = Index

            if cls.STATIC:
                return constructor(pd_idx, name=pd_idx.name)
            # carry the level name over for grow-only levels too, so both variants keep the same per-level names
            # NOTE(review): assumes _MUTABLE_CONSTRUCTOR accepts ``name`` like IndexGO / IndexNanosecondGO below -- confirm
            return tp.cast(Index, constructor._MUTABLE_CONSTRUCTOR(pd_idx, name=pd_idx.name))

        indices: tp.List[Index] = []
        # one row of integer codes per level, one column per label
        indexers: np.ndarray = np.empty((value.nlevels, len(value)), dtype=DTYPE_INT_DEFAULT)

        for i, (levels, codes) in enumerate(zip(value.levels, value.codes)):
            indexers[i] = codes
            indices.append(build_index(levels))

        # freeze before handing over to the hierarchy constructor
        indexers.flags.writeable = False

        return hierarchy_constructor(
                indices=indices,
                indexers=indexers,
                name=name,
                )

    elif isinstance(value, pandas.DatetimeIndex):
        # if IndexDatetime, use cls, else use IndexNanosecond
        if issubclass(cls, IndexDatetime):
            return cls(value, name=value.name)
        if not cls.STATIC:
            return IndexNanosecondGO(value, name=value.name)
        return IndexNanosecond(value, name=value.name)

    if not cls.STATIC:
        return IndexGO(value, name=value.name)
    return Index(value, name=value.name)
def from_labels(cls: tp.Type[IH],
        labels: tp.Iterable[tp.Sequence[tp.Hashable]],
        *,
        name: tp.Hashable = None,
        index_constructors: tp.Optional[IndexConstructors] = None,
        continuation_token: tp.Union[tp.Hashable, None] = CONTINUATION_TOKEN_INACTIVE
        ) -> IH:
    '''
    Construct an ``IndexHierarchy`` from an iterable of labels, where each label is a tuple defining the component labels for all hierarchies.

    Args:
        labels: an iterator or generator of tuples.
        name: passed through to the constructor.
        index_constructors: if provided, its length must equal the depth of the first label.
        continuation_token: a Hashable that will be used as a token to identify when a value in a label should use the previously encountered value at the same depth.

    Returns:
        :obj:`static_frame.IndexHierarchy`

    Raises:
        ErrorInitIndex: if the first label has fewer than two components, if the number of index constructors does not match the depth, if an outer value reappears after a different value has been seen at that depth, or if a label is deeper than the first label.
    '''
    label_source = iter(labels)
    try:
        first = next(label_source)
    except StopIteration:
        # nothing to iterate: build an empty index
        return cls(levels=cls._LEVEL_CONSTRUCTOR(cls._INDEX_CONSTRUCTOR(())), name=name)

    # the first label fixes the depth; at least two levels are required
    depth = len(first)
    if depth < 2:
        raise ErrorInitIndex('cannot create an IndexHierarchy from only one level.')
    if index_constructors and len(index_constructors) != depth:
        raise ErrorInitIndex(
                'if providing index constructors, number of index constructors must equal depth of IndexHierarchy.'
                )

    leaf_depth = depth - 1 # position of the innermost component
    branch_limit = depth - 2 # positions below this hold dicts; the position at this value holds a list

    unseen = object()
    previous = [unseen] * depth # last value observed at each depth
    tree = {} # insertion order assumed and necessary
    use_continuation = continuation_token is not CONTINUATION_TOKEN_INACTIVE

    # put first back in front
    for label in chain((first,), label_source):
        # NOTE: node starts as a dict and becomes a list once the penultimate depth is reached
        node = tree
        for d, v in enumerate(label):
            if use_continuation and v == continuation_token:
                # reuse whatever was last seen at this depth
                v = previous[d]

            if d < leaf_depth:
                holds_dict = d < branch_limit
                if v not in node:
                    node[v] = {} if holds_dict else []
                elif v != previous[d]:
                    # an existing node may only be re-entered if it is the sequential predecessor
                    if holds_dict:
                        template = 'invalid tree-form for IndexHierarchy: {} in {} cannot follow {} when {} has already been defined'
                    else:
                        template = 'invalid tree-form for IndexHierarchy: {} in {} cannot follow {} when {} has already been defined.'
                    raise ErrorInitIndex(template.format(v, label, previous[d], v))
                node = node[v]
                previous[d] = v
            elif d == leaf_depth:
                # redundancies among leaves will be caught in index creation
                node.append(v)
            else:
                raise ErrorInitIndex('label exceeded expected depth', label)

    levels = cls._tree_to_index_level(tree, index_constructors=index_constructors)
    return cls(levels=levels, name=name)
def __init__(self,
        labels: IndexInitializer,
        *,
        loc_is_iloc: bool = False,
        name: tp.Hashable = None,
        dtype: DtypeSpecifier = None
        ) -> None:
    '''
    Initialize from labels, reusing the caches of another index where possible.

    Args:
        labels: another index, a ``ContainerOperand`` (its ``values`` are used), or any iterable suitable for labels usage.
        loc_is_iloc: stored on the instance; replaced by the source's own setting when ``labels`` is a depth-one index.
        name: if None and ``labels`` is an index with a name, that name is adopted.
        dtype: must agree with the class-level ``_DTYPE`` when both are set.

    Raises:
        ErrorInitIndex: on a ``dtype`` / ``_DTYPE`` conflict, or when the number of mapped keys differs from the number of labels.
    '''
    self._recache: bool = False
    self._map: tp.Dict[tp.Hashable, int] = None

    positions = None

    # the target dtype comes from the argument if given, else from the class attribute (set in some specialized Index classes)
    if dtype is not None:
        if self._DTYPE is not None and dtype != self._DTYPE:
            raise ErrorInitIndex('invalid dtype argument for this Index', dtype, self._DTYPE)
        dtype_extract = dtype
    else:
        dtype_extract = self._DTYPE

    # unwrap containers down to an array or an iterable of labels
    if issubclass(labels.__class__, IndexBase):
        if labels._recache:
            labels._update_array_cache()
        if name is None and labels.name is not None:
            name = labels.name

        if labels.depth != 1:
            # IndexHierarchy: a generator of tuples; caches already updated
            labels = array2d_to_tuples(labels._labels)
        else:
            if labels.STATIC:
                # the map can be taken directly
                self._map = labels._map
            # the cached arrays are taken by reference, even from a grow-only index, on the assumption that they are up to date
            positions = labels._positions
            loc_is_iloc = labels._loc_is_iloc
            labels = labels._labels
    elif isinstance(labels, ContainerOperand):
        # a Series or similar
        values = labels.values
        labels = values if values.ndim == 1 else array2d_to_tuples(values)
    # otherwise labels is used as given

    if self._DTYPE is not None:
        if isinstance(labels, np.ndarray):
            # coerce to target type
            labels = labels.astype(dtype_extract)
        else:
            # for now, a defined _DTYPE is taken to mean the labels are dates
            labels = (to_datetime64(v, dtype_extract) for v in labels)

    self._name = None if name is None else name_filter(name)

    if self._map is None:
        self._map = self._get_map(labels, positions)

    # when labels is an array its dtype is compared against dtype_extract during extraction
    self._labels = self._extract_labels(self._map, labels, dtype_extract)
    self._positions = self._extract_positions(self._map, positions)

    # NOTE(review): this tests the truthiness of self._DTYPE, whereas the checks above use `is not None` -- confirm the two are meant to differ
    if self._DTYPE and self._labels.dtype != self._DTYPE:
        raise ErrorInitIndex('invalid label dtype for this Index',
                self._labels.dtype, self._DTYPE)

    # fewer map keys than labels means duplicates were given
    if len(self._map) != len(self._labels):
        raise ErrorInitIndex(
                f'labels ({len(self._labels)}) have non-unique values ({len(self._map)})'
                )

    # NOTE: automatic discovery is possible, but not yet implemented
    self._loc_is_iloc = loc_is_iloc
def __init__(self,
        labels: IndexInitializer,
        *,
        loc_is_iloc: bool = False,
        name: NameType = NAME_DEFAULT,
        dtype: DtypeSpecifier = None
        ) -> None:
    '''
    Initialize from labels, sharing the map and cached arrays of another index where that is safe.

    Args:
        labels: another index, a ``ContainerOperand`` (its ``values`` are used), or any iterable suitable for labels usage.
        loc_is_iloc: if True, no map is built and labels are assumed unique; replaced by the source's state when ``labels`` is an ``Index``.
        name: if left as ``NAME_DEFAULT`` and ``labels`` is an index, the name of that index is adopted; if still ``NAME_DEFAULT``, the name is stored as None.
        dtype: must agree with the class-level ``_DTYPE`` when both are set.

    Raises:
        ErrorInitIndex: on a ``dtype`` / ``_DTYPE`` conflict, or when building the map fails with a ``ValueError``.
    '''
    self._recache: bool = False
    self._map: tp.Optional[FrozenAutoMap] = None

    positions = None
    is_typed = self._DTYPE is not None

    # the target dtype comes from the argument if given, else from the class attribute (set in some specialized Index classes)
    if dtype is not None:
        if is_typed and dtype != self._DTYPE:
            # NOTE: should never get to this branch, as derived Index classes that set _DTYPE remove dtype from __init__
            raise ErrorInitIndex('invalid dtype argument for this Index', dtype, self._DTYPE) #pragma: no cover
        dtype_extract = dtype
    else:
        dtype_extract = self._DTYPE

    #-----------------------------------------------------------------------
    # unwrap containers down to an array or an iterable of labels
    if isinstance(labels, IndexBase):
        if labels._recache:
            labels._update_array_cache()
        if name is NAME_DEFAULT:
            name = labels.name

        if not isinstance(labels, Index):
            # IndexHierarchy: a generator of tuples; caches already updated
            labels = array2d_to_tuples(labels.__iter__())
        else:
            both_static = labels.STATIC and self.STATIC
            if both_static and dtype is None:
                # the map is only shared when its keys already have the type the labels will end up with
                if not is_typed or self._DTYPE == labels.dtype:
                    self._map = labels._map
            # the cached arrays are taken by reference, even from a grow-only index, on the assumption that they are up to date; datetime64 labels may still be converted below
            positions = labels._positions
            loc_is_iloc = labels._map is None
            labels = labels._labels
    elif isinstance(labels, ContainerOperand):
        # a Series or similar
        values = labels.values
        labels = values if values.ndim == 1 else array2d_to_tuples(values)
    # otherwise labels is used as given

    #-----------------------------------------------------------------------
    if is_typed:
        if not isinstance(labels, np.ndarray):
            # for now, a defined _DTYPE is taken to mean the labels are dates
            labels = (to_datetime64(v, dtype_extract) for v in labels)
        elif labels.dtype != dtype_extract:
            # coerce to target type; the converted copy is frozen straight away
            labels = labels.astype(dtype_extract)
            labels.flags.writeable = False #type: ignore

    self._name = None if name is NAME_DEFAULT else name_filter(name)

    if self._map is not None:
        # map shared from another Index
        size = len(self._map)
    else:
        if loc_is_iloc:
            # labels must be assumed unique, and must be sized: internal clients passing loc_is_iloc are expected not to give a generator
            size = len(labels) #type: ignore
        else:
            map_cls = FrozenAutoMap if self.STATIC else AutoMap
            try:
                self._map = map_cls(labels)
            except ValueError:
                # a ValueError is taken to mean non-unique values were encountered
                pass

            if self._map is None:
                # NOTE(review): if labels is a one-shot generator it has presumably been consumed by the failed map construction, so these counts may not reflect the input -- confirm
                raise ErrorInitIndex(
                        f'labels ({len(tuple(labels))}) have non-unique values ({len(set(labels))})'
                        )
            size = len(self._map)

        if positions is None:
            positions = PositionsAllocator.get(size)

    # when labels is an array its dtype is compared against dtype_extract during extraction
    self._labels = self._extract_labels(self._map, labels, dtype_extract)
    self._positions = self._extract_positions(size, positions)

    # NOTE(review): this tests the truthiness of self._DTYPE, whereas is_typed above uses `is not None` -- confirm the two are meant to differ
    if self._DTYPE and self._labels.dtype != self._DTYPE:
        raise ErrorInitIndex('invalid label dtype for this Index', #pragma: no cover
                self._labels.dtype, self._DTYPE)