def _init_values(self, data, init_index=True, init_columns=True, **kwargs):
    """Initialize the underlying CSR storage from *data*.

    Parameters
    ----------
    data: pd.DataFrame | scipy sparse matrix | array-like
        Values to convert into a CSR matrix.
    init_index: bool
        If True and *data* is a DataFrame, adopt its index; otherwise
        warn that the DataFrame's own index is ignored.
    init_columns: bool
        If True and *data* is a DataFrame, adopt its columns; otherwise
        warn that the DataFrame's own columns are ignored.
    **kwargs:
        Forwarded to ``sparse.csr_matrix`` for non-DataFrame input.
    """
    if isinstance(data, pd.DataFrame):
        self.empty = data.empty
        self._init_csr(sparse.csr_matrix(data.values))
        if init_index:
            self._index = _ensure_index(data.index)
        else:
            # Bug fix: these are runtime-usage warnings, not syntax
            # problems -- SyntaxWarning was the wrong category.
            warnings.warn("Passed index explicitly while initializing "
                          "from pd.DataFrame. Original DataFrame's index "
                          "will be ignored.", UserWarning)
        if init_columns:
            self._columns = _ensure_index(data.columns)
        else:
            warnings.warn("Passed columns explicitly while initializing "
                          "from pd.DataFrame. Original DataFrame's columns"
                          " will be ignored.", UserWarning)
    elif _is_empty(data):
        # Empty input: allocate an all-zero matrix matching the already
        # initialized index/columns.
        self.empty = True
        self._data = sparse.csr_matrix((len(self.index),
                                        len(self.columns)))
        self.shape = self._data.shape
    else:
        sparse_data = sparse.csr_matrix(data, **kwargs)
        self._init_csr(sparse_data)
def _init_values(self, data, kwargs):
    """Set up CSR storage from ``data``.

    ``kwargs`` is a plain dict (passed positionally by ``__init__``)
    that is forwarded to ``sparse.csr_matrix`` for generic inputs.
    """
    if isinstance(data, pd.DataFrame):
        # DataFrame: take values, index and columns from it directly.
        self.empty = data.empty
        self._init_csr(sparse.csr_matrix(data.values))
        self._index = _ensure_index(data.index)
        self._columns = _ensure_index(data.columns)
        return

    if _is_empty(data):
        # Empty input: allocate an all-zero matrix matching the
        # existing index/columns.
        self.empty = True
        self._data = sparse.csr_matrix((len(self.index),
                                        len(self.columns)))
        self.shape = self._data.shape
        return

    # Anything else: let scipy coerce it.
    self._init_csr(sparse.csr_matrix(data, **kwargs))
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    # Map each value of `target` to its position in this IntervalIndex
    # (-1 for values not found).
    # NOTE(review): `limit` and `tolerance` are accepted for API
    # compatibility but unused here; `_check_method` presumably rejects
    # unsupported `method` values -- confirm against its definition.
    self._check_method(method)
    target = _ensure_index(target)
    target = self._maybe_cast_indexed(target)

    # Fast path: identical index -> identity indexer.
    if self.equals(target):
        return np.arange(len(self), dtype='intp')

    if self.is_non_overlapping_monotonic:
        start, stop = self._find_non_overlapping_monotonic_bounds(target)
        start_plus_one = start + 1
        if not ((start_plus_one < stop).any()):
            # Each target value hits at most one interval: use that
            # position where start + 1 == stop, -1 (missing) otherwise.
            return np.where(start_plus_one == stop, start, -1)

    # Beyond this point each lookup may yield multiple positions, which
    # only works for a unique index.
    if not self.is_unique:
        raise ValueError("cannot handle non-unique indices")

    # IntervalIndex
    if isinstance(target, IntervalIndex):
        indexer = self._get_reindexer(target)
    # non IntervalIndex
    else:
        # Per-element get_loc may return arrays; concatenate them.
        indexer = np.concatenate([self.get_loc(i) for i in target])

    return _ensure_platform_int(indexer)
def reindex(self, target, method=None, level=None, limit=None,
            tolerance=None):
    """
    Create index with target's values (move/add/delete values as
    necessary).

    Parameters
    ----------
    target : an iterable of values to reindex to
    method, level, limit : not supported for CategoricalIndex; passing
        a non-None value raises NotImplementedError.

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index
    """
    if method is not None:
        raise NotImplementedError("argument method is not implemented for "
                                  "CategoricalIndex.reindex")
    if level is not None:
        raise NotImplementedError("argument level is not implemented for "
                                  "CategoricalIndex.reindex")
    if limit is not None:
        raise NotImplementedError("argument limit is not implemented for "
                                  "CategoricalIndex.reindex")

    target = ibase._ensure_index(target)

    if not is_categorical_dtype(target) and not target.is_unique:
        raise ValueError("cannot reindex with a non-unique indexer")

    indexer, missing = self.get_indexer_non_unique(np.array(target))
    new_target = self.take(indexer)

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            # (some targets are outside our categories, so categorical
            # reindexing cannot represent the result)
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(
                np.array(target))
        else:
            # All targets are known categories: patch the missing slots
            # with their category codes.
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            new_target = self._create_from_codes(codes)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if is_categorical_dtype(target):
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    """Compute an indexer mapping ``target`` onto this CategoricalIndex.

    Fill methods ('pad'/'backfill'/'nearest') are not supported and
    raise NotImplementedError.
    """
    method = missing.clean_reindex_fill_method(method)
    target = ibase._ensure_index(target)

    # Identical index -> identity indexer, no lookup needed.
    if self.equals(target):
        return np.arange(len(self), dtype='intp')

    if method in ('pad', 'backfill'):
        raise NotImplementedError("method='pad' and method='backfill' not "
                                  "implemented yet for CategoricalIndex")
    if method == 'nearest':
        raise NotImplementedError("method='nearest' not implemented yet "
                                  'for CategoricalIndex')

    same_categories = (isinstance(target, CategoricalIndex) and
                       self.values.is_dtype_equal(target))
    if same_categories:
        # we have the same codes -- they are directly comparable
        codes = target.codes
    else:
        if isinstance(target, CategoricalIndex):
            target = target.categories
        codes = self.categories.get_indexer(target)

    indexer, _ = self._engine.get_indexer_non_unique(codes)
    return _ensure_platform_int(indexer)
def set_index(self, column=None, idx=None, level=None, inplace=False):
    """Set index from array, column or existing multi-index level.

    Parameters
    ----------
    column: str
        set index from existing column in data.
    idx: pd.Index, np.array
        Set the index directly with a pandas index object or array
    level: int
        set index from a multiindex level. useful for groupbys.
    inplace: bool
        perform data transformation inplace

    Returns
    -------
    sf: sp.SparseFrame | None
        the transformed sparse frame or None if inplace was True

    Raises
    ------
    ValueError
        If no source was given, or *level* was passed (without a
        *column* fallback) while the current index is not a MultiIndex.
    """
    if column is None and idx is None and level is None:
        raise ValueError("Either column, idx or level should not be None")
    elif idx is not None:
        assert len(idx) == self.data.shape[0]
        new_idx = idx
    elif level is not None and \
            isinstance(self._index, pd.MultiIndex):
        new_idx = self.index.get_level_values(level)
    elif column is not None:
        new_idx = np.asarray(self.loc[:, column].data.todense()).reshape(-1)
    else:
        # Bug fix: `level` was passed but the index is not a MultiIndex;
        # previously `new_idx` stayed unbound and a confusing NameError
        # was raised below.
        raise ValueError("Setting the index by level requires the "
                         "current index to be a MultiIndex")

    if inplace:
        self._index = _ensure_index(new_idx)
    else:
        return SparseFrame(self.data,
                           index=new_idx,
                           columns=self.columns)
def _as_like_interval_index(self, other, error_msg):
    """Coerce *other* to an index and verify it is an IntervalIndex
    closed on the same side as self; raise ValueError(error_msg)
    otherwise."""
    self._assert_can_do_setop(other)
    other = _ensure_index(other)
    compatible = (isinstance(other, IntervalIndex) and
                  other.closed == self.closed)
    if not compatible:
        raise ValueError(error_msg)
    return other
def get_indexer_non_unique(self, target):
    # Translate target's values into this index's category codes, then
    # let the engine resolve positions (duplicates and missing included).
    target = ibase._ensure_index(target)
    if isinstance(target, CategoricalIndex):
        target = target.categories
    target_codes = self.categories.get_indexer(target)
    return self._engine.get_indexer_non_unique(target_codes)
def get_indexer_non_unique(self, target):
    """Same semantics as get_indexer for a CategoricalIndex, except the
    API additionally returns the positions of missing values."""
    target = ibase._ensure_index(target)

    # A categorical target is matched by its categories, not its codes.
    if isinstance(target, CategoricalIndex):
        target = target.categories

    return self._engine.get_indexer_non_unique(
        self.categories.get_indexer(target))
def __init__(self, data, index=None, columns=None, **kwargs):
    """Init SparseFrame.

    Parameters
    ----------
    data: sparse.csr_matrix | np.ndarray | pd.DataFrame | pd.Series
        Data to initialize the matrix with; anything accepted by
        sparse.csr_matrix (with matching kwargs) also works.
    index: pd.Index or array-like
        Row labels; defaults to a fresh default index.
    columns: pd.Index or array-like
        Column labels; defaults like index.

    Raises
    ------
    ValueError
        If data has more than two dimensions, or a passed index/columns
        length does not match the (non-empty) data shape.
    TypeError
        If data could not be converted to a sparse matrix.
    """
    if len(data.shape) > 2:
        raise ValueError("Only two dimensional data supported")

    if len(data.shape) == 1 and isinstance(data, pd.Series):
        data = data.to_frame()
    elif len(data.shape) == 1:
        data = data.reshape(-1, 1)

    self.empty = False
    N, K = data.shape

    if index is None:
        self._index = _default_index(N)
    elif len(index) != N and data.size:
        # Bug fix: length validation existed only as a commented-out
        # assert; report the mismatch like pandas does instead of
        # silently accepting a wrong-length index.
        if columns is not None:
            implied_axis_1 = len(columns)
        else:
            implied_axis_1 = data.shape[1]
        raise ValueError('Shape of passed values is {},'
                         'indices imply {}'.format(
                             data.shape, (len(index), implied_axis_1)))
    else:
        self._index = _ensure_index(index)

    if columns is None:
        self._columns = _default_index(K)
    elif len(columns) != K and data.size:
        if index is not None:
            implied_axis_0 = len(index)
        else:
            implied_axis_0 = data.shape[0]
        raise ValueError('Shape of passed values is {},'
                         'indices imply {}'.format(
                             data.shape, (implied_axis_0, len(columns))))
    else:
        self._columns = _ensure_index(columns)

    if not sparse.isspmatrix_csr(data):
        try:
            # `kwargs` is passed positionally: _init_values takes a dict.
            self._init_values(data, kwargs)
        except TypeError:
            raise TypeError(traceback.format_exc() +
                            "\nThe error described above occurred while "
                            "converting data to sparse matrix.")
    else:
        self._init_csr(data)

    # register indexers
    self.ndim = 2
    self.iloc = _CsrILocationIndexer(self, 'iloc')
    self.loc = _CsrLocIndexer(self, 'loc')
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    """Compute an indexer for *target* by delegating to the underlying
    int64 view of this PeriodIndex.

    Raises IncompatibleFrequency when *target* carries a different freq.
    """
    target = _ensure_index(target)

    if hasattr(target, 'freq'):
        # Frequency mismatch (including a None freq on either side that
        # differs) is an error for period alignment.
        if target.freq != self.freq:
            raise IncompatibleFrequency(
                _DIFFERENT_FREQ_INDEX.format(self.freqstr, target.freqstr))

    if isinstance(target, PeriodIndex):
        # Compare ordinals directly.
        target = target.asi8

    if tolerance is not None:
        tolerance = self._convert_tolerance(tolerance)

    return Index.get_indexer(self._int64index, target, method,
                             limit, tolerance)
def _get_combined_index(indexes, intersect=False):
    """Combine a list of indexes into a single index, by union
    (default) or intersection.

    # TODO: handle index names!
    """
    indexes = _get_distinct_indexes(indexes)

    if not indexes:
        return Index([])
    if len(indexes) == 1:
        return indexes[0]

    if intersect:
        combined = indexes[0]
        for idx in indexes[1:]:
            combined = combined.intersection(idx)
        return combined

    return _ensure_index(_union_indexes(indexes))
def _simple_new(cls, left, right, closed=None, name=None, copy=False,
                verify_integrity=True):
    # Fast-path constructor: build an IntervalIndex from left/right edge
    # arrays, bypassing the heavier __init__ machinery. `closed` defaults
    # to 'right'; `verify_integrity` toggles the final _validate() call.
    result = IntervalMixin.__new__(cls)

    if closed is None:
        closed = 'right'
    left = _ensure_index(left, copy=copy)
    right = _ensure_index(right, copy=copy)

    # coerce dtypes to match if needed
    # (mixed int/float edges are promoted to float on both sides)
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    if is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    # After coercion both edges must be the same index type.
    if type(left) != type(right):
        raise ValueError("must not have differing left [{}] "
                         "and right [{}] types".format(
                             type(left), type(right)))

    if isinstance(left, ABCPeriodIndex):
        raise ValueError("Period dtypes are not supported, "
                         "use a PeriodIndex instead")

    result._left = left
    result._right = right
    result._closed = closed
    result.name = name
    if verify_integrity:
        result._validate()
    result._reset_identity()
    return result
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    """
    Compute indexer for a new index given the current index. The
    indexer should then be used as an input to ndarray.take to align
    the current data to the new index.

    Parameters
    ----------
    target : MultiIndex or Index (of tuples)
    method : {'pad', 'ffill', 'backfill', 'bfill', 'nearest'}
        Not implemented for CategoricalIndex; any non-None value raises
        NotImplementedError.

    Returns
    -------
    indexer : np.ndarray
        Positions of target's values in this index; -1 marks values not
        found.
        NOTE(review): an earlier docstring advertised an
        ``(indexer, mask)`` pair, but the code below returns only the
        indexer -- use get_indexer_non_unique for the missing positions.

    Notes
    -----
    This is a low-level method and probably should be used at your
    own risk
    """
    method = missing.clean_reindex_fill_method(method)
    target = ibase._ensure_index(target)

    # A categorical target is matched via its categories.
    if isinstance(target, CategoricalIndex):
        target = target.categories

    if method == 'pad' or method == 'backfill':
        raise NotImplementedError("method='pad' and method='backfill' not "
                                  "implemented yet for CategoricalIndex")
    elif method == 'nearest':
        raise NotImplementedError("method='nearest' not implemented yet "
                                  'for CategoricalIndex')
    else:
        codes = self.categories.get_indexer(target)

    indexer, _ = self._engine.get_indexer_non_unique(codes)
    return _ensure_platform_int(indexer)
def set_index(self, column=None, idx=None, level=None, inplace=False):
    """Set index from array, column or existing multi-index level.

    Parameters
    ----------
    column: str
        Set index from an existing column in data.
    idx: pd.Index or array-like
        Set the index directly.
    level: int
        Set index from a MultiIndex level; requires the current index
        to be a MultiIndex.
    inplace: bool
        Perform the transformation in place.

    Returns
    -------
    SparseFrame | None
        The re-indexed frame, or None when inplace is True.

    Raises
    ------
    ValueError
        If no source was given, or *level* was passed (without a
        *column* fallback) while the current index is not a MultiIndex.
    """
    if column is None and idx is None and level is None:
        raise ValueError("Either column, idx or level should not be None")
    elif idx is not None:
        assert len(idx) == self.data.shape[0]
        new_idx = idx
    elif level is not None and \
            isinstance(self._index, pd.MultiIndex):
        new_idx = self.index.get_level_values(level)
    elif column is not None:
        new_idx = np.asarray(self[column].data.todense()).reshape(-1)
    else:
        # Bug fix: `level` was passed but the index is not a MultiIndex;
        # previously `new_idx` stayed unbound and a confusing NameError
        # was raised below.
        raise ValueError("Setting the index by level requires the "
                         "current index to be a MultiIndex")

    if inplace:
        self._index = _ensure_index(new_idx)
    else:
        return SparseFrame(self.data,
                           index=new_idx,
                           columns=self.columns)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
    """Map *target*'s values to positions in this CategoricalIndex via
    their category codes; fill methods are not supported."""
    method = missing.clean_reindex_fill_method(method)
    target = ibase._ensure_index(target)

    if isinstance(target, CategoricalIndex):
        target = target.categories

    if method in ('pad', 'backfill'):
        raise NotImplementedError("method='pad' and method='backfill' not "
                                  "implemented yet for CategoricalIndex")
    if method == 'nearest':
        raise NotImplementedError("method='nearest' not implemented yet "
                                  'for CategoricalIndex')

    codes = self.categories.get_indexer(target)
    indexer, _ = self._engine.get_indexer_non_unique(codes)
    return _ensure_platform_int(indexer)
def __init__(self, data, index=None, columns=None, **kwargs):
    """Init SparseFrame

    Parameters
    ----------
    data: sparse.csr_matrix | np.ndarray | pandas.DataFrame
        Data to initialize matrix with. Can be one of above types, or
        anything accepted by sparse.csr_matrix along with the correct
        kwargs.
    index: pd.Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        input data has no indexing information and no index provided.
    columns : pd.Index or array-like
        Column labels to use for resulting frame. Defaults like in index.

    Raises
    ------
    ValueError
        If data has more than two dimensions, or a passed index/columns
        length does not match the (non-empty) data shape.
    TypeError
        If data could not be converted to a sparse matrix.
    """
    if len(data.shape) > 2:
        raise ValueError("Only two dimensional data supported")

    # Promote 1-D input to a single-column 2-D shape.
    if len(data.shape) == 1 and isinstance(data, pd.Series):
        data = data.to_frame()
    elif len(data.shape) == 1:
        data = data.reshape(-1, 1)

    self.empty = False
    N, K = data.shape

    if index is None:
        self._index = _default_index(N)
    elif len(index) != N and data.size:
        # Length mismatch is only an error for non-empty data; report
        # the shape the passed labels imply, pandas-style.
        if columns is not None:
            implied_axis_1 = len(columns)
        else:
            implied_axis_1 = data.shape[1]
        raise ValueError('Shape of passed values is {},'
                         'indices imply {}'.format(
                             data.shape, (len(index), implied_axis_1)))
    else:
        self._index = _ensure_index(index)

    if columns is None:
        self._columns = _default_index(K)
    elif len(columns) != K and data.size:
        if index is not None:
            implied_axis_0 = len(index)
        else:
            implied_axis_0 = data.shape[0]
        raise ValueError('Shape of passed values is {},'
                         'indices imply {}'.format(
                             data.shape, (implied_axis_0, len(columns))))
    else:
        self._columns = _ensure_index(columns)

    if not sparse.isspmatrix_csr(data):
        try:
            # Adopt index/columns from a DataFrame only when the caller
            # did not pass them explicitly.
            self._init_values(data,
                              init_index=index is None,
                              init_columns=columns is None,
                              **kwargs)
        except TypeError:
            raise TypeError(traceback.format_exc() +
                            "\nThe error described above occurred while "
                            "converting data to sparse matrix.")
    else:
        self.empty = True if _is_empty(data) else False
        self._init_csr(data)

    self.ndim = 2
def get_indexer_non_unique(self, target):
    """Coerce *target* for interval comparison, then defer to the
    base-class non-unique indexer."""
    target = _ensure_index(target)
    target = self._maybe_cast_indexed(target)
    return super(IntervalIndex, self).get_indexer_non_unique(target)