def _extract_labels(
        mapping: tp.Optional[tp.Dict[tp.Hashable, int]],
        labels: tp.Iterable[tp.Hashable],
        dtype: tp.Optional[np.dtype] = None,
        ) -> np.ndarray:
    '''Return the labels as a 1D array, to be cached alongside the mapping.

    An ndarray given as ``labels`` is returned after immutable filtering. Any other sized iterable is converted to an array. If ``labels`` has no length (for example, a generator that may already be exhausted), the keys of ``mapping`` are converted instead.

    This method is overridden in the derived class.

    Args:
        mapping: Can be None if loc_is_iloc.
        labels: might be an expired Generator, but if it is an immutable ndarray, we can use it without a copy.
        dtype: if given, an ndarray ``labels`` must already have this dtype; for other inputs it is forwarded to the array conversion.

    Raises:
        ErrorInitIndex: if ``labels`` is an ndarray whose dtype differs from ``dtype``.
    '''
    # pre-fetching labels for faster get_item construction
    if isinstance(labels, np.ndarray):
        if dtype is not None and dtype != labels.dtype:
            raise ErrorInitIndex('invalid label dtype for this Index')
        return immutable_filter(labels)

    # a sized iterable (not a generator, not an array) is used directly; otherwise labels may be an expired generator and the mapping must be used
    source = labels if hasattr(labels, '__len__') else mapping

    # resolving the dtype is expensive, so skip the conversion when there is nothing to convert
    if len(source) == 0: #type: ignore
        return EMPTY_ARRAY

    array, _ = iterable_to_array_1d(source, dtype=dtype) #type: ignore
    # all arrays are immutable
    return array
def test_iterable_to_array_a(self, array: np.ndarray) -> None:
    '''Converting the list form of an array gives back almost-equal values, with and without an explicit object dtype.
    '''
    values = array.tolist()

    # first let the dtype be discovered, then explicitly give the object dtype
    for kwargs in ({}, {'dtype': util.DTYPE_OBJECT}):
        post, _ = util.iterable_to_array_1d(values, **kwargs)
        self.assertAlmostEqualValues(post, values)
def to_series_values(
        self,
        values: tp.Iterator[tp.Any],
        *,
        dtype: DtypeSpecifier,
        name: NameType = None,
        index_constructor: tp.Optional[IndexConstructor] = None,
        axis: int = 0,
        ) -> 'Series':
    '''Realize ``values`` as a 1D array and return it as a ``Series`` indexed like the source container.

    Args:
        values: iterator of the values to collect.
        dtype: passed to the array conversion.
        name: name given to the returned ``Series``.
        index_constructor: if not None, called with the selected index and its result used as the index.
        axis: with a 2D container and ``axis`` 0, the container's columns are used as the index; otherwise the container's index is used.
    '''
    from static_frame.core.series import Series

    container = self._container
    # Creating a Series that will have the same index as source container
    use_columns = container._NDIM == 2 and axis == 0
    if use_columns:
        index = container._columns #type: ignore
    else:
        index = container._index
    own_index = not use_columns

    if index_constructor is not None:
        index = index_constructor(index)

    # PERF: passing count here permits faster generator realization
    array, _ = iterable_to_array_1d(
            values,
            count=index.shape[0],
            dtype=dtype,
            )
    return Series(array, name=name, index=index, own_index=own_index)
def gen() -> tp.Iterator[np.ndarray]:
    '''Yield one immutable array per paired entry of ``arrays`` and ``dtypes`` from the enclosing scope.
    '''
    for values, dtype in zip(arrays, dtypes):
        if dtype is not None:
            # a dtype was given for this position: build the array with it directly
            array = np.array(values, dtype=dtype)
        else:
            # no dtype given: let the conversion routine determine it
            array, _ = iterable_to_array_1d(values)
        array.flags.writeable = False
        yield array
def to_index_labels(
        self,
        values: tp.Iterator[tp.Hashable], #pylint: disable=function-redefined
        dtype: DtypeSpecifier = None,
        name: NameType = None,
        index_constructor: tp.Optional[IndexConstructor] = None,
        ) -> np.ndarray:
    '''Realize ``values`` as a 1D array of labels.

    Args:
        values: iterator of the labels to collect.
        dtype: passed to the array conversion.
        name: unused; accepted for a common interface.
        index_constructor: must be None.

    Raises:
        RuntimeError: if ``index_constructor`` is not None.
    '''
    # NOTE: name argument is for common interface
    if index_constructor is not None:
        raise RuntimeError('index_constructor not supported with this interface')

    # PERF: passing count here permits faster generator realization
    row_count = self._container.shape[0]
    labels, _ = iterable_to_array_1d(values, count=row_count, dtype=dtype)
    return labels
def to_index(
        labels: tp.Iterable[tp.Hashable],
        *,
        default_constructor: tp.Type[IndexBase],
        name: NameType = None,
        ) -> IndexBase:
    '''Build an index from ``labels``, choosing the index class from the dtype of the label array.

    Args:
        labels: an ndarray, used as given, or any other iterable, which is first converted to a 1D array.
        default_constructor: its ``STATIC`` attribute is forwarded to the class lookup.
        name: name given to the created index.
    '''
    from static_frame.core.index_datetime import dtype_to_index_cls

    if labels.__class__ is np.ndarray:
        array = labels
    else:
        # we can assume that this is 1D; returns an immutable array
        array, _ = iterable_to_array_1d(labels)

    index_cls = dtype_to_index_cls(
            static=default_constructor.STATIC,
            dtype=array.dtype, #type: ignore
            )
    return index_cls(array, name=name)
def array_from_value_iter(
        key: tp.Hashable,
        idx: int,
        get_value_iter: tp.Callable[[tp.Hashable], tp.Iterator[tp.Any]],
        get_col_dtype: tp.Optional[tp.Callable[[int], np.dtype]],
        row_count: int,
        ) -> np.ndarray:
    '''
    Return a single immutable array for one key.

    Args:
        key: hashable passed to `get_value_iter` to obtain the values.
        idx: integer position passed to `get_col_dtype` to obtain a dtype.
        get_value_iter: called with `key`; returns an iterator of values. It is called a second time if the first attempt to build the array fails.
        get_col_dtype: if not None, called with `idx`; the dtype it returns may itself be None.
        row_count: number of values, given to `np.fromiter` as `count`.
    '''
    # for each column, try to get a dtype, or None
    # if this value is None we cannot tell if it was explicitly None or just was not specified
    dtype = None if get_col_dtype is None else get_col_dtype(idx)

    if dtype is not None:
        # NOTE: shown to be faster to try fromiter in some performance tests
        try:
            values = np.fromiter(
                    get_value_iter(key),
                    count=row_count,
                    dtype=dtype,
                    )
        except (ValueError, TypeError):
            # the dtype may not be compatible, so must fall back on the general conversion, i.e., ValueError: cannot convert float NaN to integer
            pass
        else:
            values.flags.writeable = False
            return values

    # returns an immutable array
    values, _ = iterable_to_array_1d(get_value_iter(key), dtype=dtype)
    return values
def iloc_searchsorted(
        self,
        values: tp.Any,
        *,
        side_left: bool = True,
        ) -> tp.Union[tp.Hashable, tp.Iterable[tp.Hashable]]:
    '''
    {doc}

    Args:
        {values}
        {side_left}
    '''
    if isinstance(values, str) or not hasattr(values, '__len__'):
        # a string or an unsized value is searched for as a single element
        targets = values
    elif values.__class__ is np.ndarray:
        # already an array: use without conversion
        targets = values
    else:
        # any other sized collection is converted to a 1D array first
        targets, _ = iterable_to_array_1d(values)

    side = 'left' if side_left else 'right'
    return np.searchsorted(self.values, targets, side) #type: ignore [no-any-return]
def _extract_labels(
        mapping,
        labels,
        dtype: tp.Optional[np.dtype] = None
        ) -> np.ndarray:
    '''Derive labels, a cache of the mapping keys in an ndarray.

    If the labels passed at instantiation are an ndarray, they are used after immutable filtering. If they are any other sized iterable, they are converted to an array. Otherwise, the mapping is used to create an ndarray, placing each key at the position given by its value.

    This method is overridden in the derived class.

    Args:
        mapping: dictionary of label to integer position; only read when ``labels`` has no length.
        labels: might be an expired Generator, but if it is an immutable ndarray, we can use it without a copy.
        dtype: if given, an ndarray ``labels`` must already have this dtype; otherwise it is the dtype of the created array. When None, arrays built from the mapping use the object dtype.

    Raises:
        RuntimeError: if ``labels`` is an ndarray whose dtype differs from ``dtype``.
    '''
    # pre-fetching labels for faster get_item construction
    if isinstance(labels, np.ndarray):
        if dtype is not None and dtype != labels.dtype:
            raise RuntimeError('invalid label dtype for this Index')
        return immutable_filter(labels)

    if hasattr(labels, '__len__'):
        # not a generator, not an array
        # resolving the dtype is expensive, pass if possible
        labels, _ = iterable_to_array_1d(labels, dtype=dtype)
        return labels

    # labels may be an expired generator, must use the mapping
    labels_len = len(mapping)
    if labels_len == 0:
        return EMPTY_ARRAY

    # NOTE: compare against None explicitly rather than testing truthiness: on some NumPy versions a non-structured dtype evaluates as False, which would silently replace the requested dtype with object
    array = np.empty(labels_len, dtype=dtype if dtype is not None else object)
    # element-wise assignment keeps each key as a single element, in the position the mapping records for it
    for label, position in mapping.items():
        array[position] = label
    array.flags.writeable = False
    return array
def pivot_items_to_frame(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_field_iloc: tp.Hashable,
        func_single: tp.Optional[UFunc],
        frame_cls: tp.Type['Frame'],
        name: NameType,
        dtype: np.dtype,
        index_constructor: IndexConstructor,
        columns_constructor: IndexConstructor,
        kind: str,
        ) -> 'Frame':
    '''
    Build a single-column Frame for the case of one data field and at most one function.

    With ``func_single``, rows are grouped by the group fields and the function is applied to the data extracted for each group; the group labels form the index. Without it, the group field values are used as the index and the data field column is used as-is.
    '''
    from static_frame.core.series import Series

    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        group_key = group_fields_iloc[0] #type: ignore

    if not func_single:
        # func_no scenario
        if group_depth == 1:
            index = index_constructor(blocks._extract_array_column(group_key))
        else:
            index = index_constructor(
                    tuple(label)
                    for label in blocks._extract_array(column_key=group_key)
                    )
        array = blocks._extract_array_column(data_field_iloc)
    else:
        labels = []
        values = []
        groups = blocks.group_extract(
                axis=0,
                key=group_key,
                extract=data_field_iloc,
                kind=kind,
                )
        for label, _, part in groups:
            labels.append(label)
            values.append(func_single(part))

        if dtype is None:
            array, _ = iterable_to_array_1d(values, count=len(values))
        else:
            array = np.array(values, dtype=dtype)
        array.flags.writeable = False
        index = index_constructor(labels)

    return frame_cls.from_elements(
            array,
            index=index,
            own_index=True,
            columns=(name,),
            columns_constructor=columns_constructor,
            )
def pivot_records_items_to_blocks(
        *,
        blocks: TypeBlocks,
        group_fields_iloc: tp.Iterable[tp.Hashable],
        group_depth: int,
        data_fields_iloc: tp.Iterable[tp.Hashable],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        func_no: bool,
        fill_value: tp.Any,
        fill_value_dtype: np.dtype,
        index_outer: 'IndexBase',
        dtypes: tp.Tuple[tp.Optional[np.dtype]],
        kind: str,
        ) -> tp.List[np.ndarray]:
    '''
    Group ``blocks`` by the group fields and, for each group, write one value per output column at the row position that the group label has in ``index_outer``. Rows for which no group was found are set to ``fill_value``. Returns one immutable array per entry in ``dtypes``.

    Raises:
        RuntimeError: if ``func_no`` is set and a group has more than one row.
    '''
    # NOTE: this delivers results by label, row for use in a Frame.from_records_items constructor
    if group_depth > 1:
        group_key = group_fields_iloc
    else:
        group_key = group_fields_iloc[0] #type: ignore

    row_count = len(index_outer)

    # Without a dtype, collect into a list pre-filled with fill_value: it will either be completely replaced (and not affect dtype evaluation) or be needed (and already there). With a dtype, try to use it; fill values at the end if necessary.
    arrays: tp.List[tp.Union[tp.List[tp.Any], np.ndarray]] = [
            [fill_value] * row_count if dtype is None else np.empty(row_count, dtype=dtype)
            for dtype in dtypes
            ]

    # collect all possible ilocs, and remove as observed; if any remain, we have fill targets
    iloc_not_found: tp.Set[int] = set(range(row_count))

    # each group forms a row, each label a value in the index
    for label, _, part in blocks.group(axis=0, key=group_key, kind=kind):
        iloc: int = index_outer._loc_to_iloc(label) #type: ignore
        iloc_not_found.remove(iloc)

        if func_no:
            if len(part) != 1:
                raise RuntimeError('pivot requires aggregation of values; provide a `func` argument.')
            for arrays_key, column_key in enumerate(data_fields_iloc):
                # this is equivalent to extracting a row, but doing so would force a type consolidation
                arrays[arrays_key][iloc] = part._extract(0, column_key)
        elif func_single:
            for arrays_key, column_key in enumerate(data_fields_iloc):
                column = part._extract_array_column(column_key)
                arrays[arrays_key][iloc] = func_single(column)
        else:
            # one output column per (data field, function) pair, in that nesting order
            arrays_key = 0
            for column_key in data_fields_iloc:
                values = part._extract_array_column(column_key)
                for _, func in func_map:
                    arrays[arrays_key][iloc] = func(values)
                    arrays_key += 1

    # if we did not fill all arrays, there are positions that need the fill value; order does not matter
    fill_targets = list(iloc_not_found) if iloc_not_found else None

    # mutate in place then make immutable
    for arrays_key, array in enumerate(arrays):
        if array.__class__ is not np.ndarray: # a list
            array, _ = iterable_to_array_1d(array, count=row_count)
        elif fill_targets is not None:
            dtype_resolved = resolve_dtype(array.dtype, fill_value_dtype) # type: ignore
            if array.dtype != dtype_resolved: # type: ignore
                array = array.astype(dtype_resolved) #type: ignore
            array[fill_targets] = fill_value
        arrays[arrays_key] = array # re-assign new array
        array.flags.writeable = False # type: ignore

    return arrays
def test_iterable_to_array_b(self, labels: tp.Iterable[tp.Any]) -> None:
    '''Converting an iterable of labels gives an ndarray with almost-equal values.
    '''
    converted, _ = util.iterable_to_array_1d(labels)

    # values are retained by the conversion
    self.assertAlmostEqualValues(converted, labels)
    # and the result is always an array
    is_array = isinstance(converted, np.ndarray)
    self.assertTrue(is_array)