def _init_values(self, data, init_index=True, init_columns=True, **kwargs):
    if isinstance(data, pd.DataFrame):
        self.empty = data.empty
        self._init_csr(sparse.csr_matrix(data.values))
        if init_index:
            self._index = _ensure_index(data.index)
        else:
            warnings.warn(
                "Passed index explicitly while initializing "
                "from pd.DataFrame. Original DataFrame's index "
                "will be ignored.", SyntaxWarning)
        if init_columns:
            self._columns = _ensure_index(data.columns)
        else:
            warnings.warn(
                "Passed columns explicitly while initializing "
                "from pd.DataFrame. Original DataFrame's columns"
                " will be ignored.", SyntaxWarning)
    elif _is_empty(data):
        self.empty = True
        self._data = sparse.csr_matrix(
            (len(self.index), len(self.columns)))
        self.shape = self._data.shape
    else:
        sparse_data = sparse.csr_matrix(data, **kwargs)
        self._init_csr(sparse_data)
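The three dispatch branches above reduce to plain scipy.sparse constructor calls; a minimal sketch using only scipy and pandas:

import numpy as np
import pandas as pd
from scipy import sparse

df = pd.DataFrame(np.eye(2))
sparse.csr_matrix(df.values)   # DataFrame branch: convert the dense values
sparse.csr_matrix((2, 3))      # empty branch: a shape tuple gives an all-zero matrix
sparse.csr_matrix(np.eye(2))   # generic branch: anything csr_matrix accepts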
Example #2
def _init_values(self, data, kwargs):
    # `kwargs` arrives as a plain dict (see the __init__ caller below)
    # and is unpacked when building the csr matrix.
    if isinstance(data, pd.DataFrame):
        self.empty = data.empty
        self._init_csr(sparse.csr_matrix(data.values))
        self._index = _ensure_index(data.index)
        self._columns = _ensure_index(data.columns)
    elif _is_empty(data):
        self.empty = True
        self._data = sparse.csr_matrix(
            (len(self.index), len(self.columns)))
        self.shape = self._data.shape
    else:
        sparse_data = sparse.csr_matrix(data, **kwargs)
        self._init_csr(sparse_data)
Example #3
    def get_indexer(self, target, method=None, limit=None, tolerance=None):

        self._check_method(method)
        target = _ensure_index(target)
        target = self._maybe_cast_indexed(target)

        if self.equals(target):
            return np.arange(len(self), dtype='intp')

        if self.is_non_overlapping_monotonic:
            start, stop = self._find_non_overlapping_monotonic_bounds(target)

            start_plus_one = start + 1
            if not ((start_plus_one < stop).any()):
                return np.where(start_plus_one == stop, start, -1)

        if not self.is_unique:
            raise ValueError("cannot handle non-unique indices")

        # IntervalIndex
        if isinstance(target, IntervalIndex):
            indexer = self._get_reindexer(target)

        # non IntervalIndex
        else:
            indexer = np.concatenate([self.get_loc(i) for i in target])

        return _ensure_platform_int(indexer)
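For context, the non-overlapping monotonic fast path above can be exercised through the public pandas API; a minimal sketch:

import pandas as pd

idx = pd.IntervalIndex.from_breaks([0, 1, 2, 3])  # (0, 1], (1, 2], (2, 3]
idx.get_indexer([0.5, 2.5, 9.0])  # array([ 0,  2, -1]); -1 means no match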
Example #4
    def reindex(self, target, method=None, level=None, limit=None,
                tolerance=None):
        """
        Create index with target's values (move/add/delete values as necessary)

        Returns
        -------
        new_index : pd.Index
            Resulting index
        indexer : np.ndarray or None
            Indices of output values in original index

        """

        if method is not None:
            raise NotImplementedError("argument method is not implemented for "
                                      "CategoricalIndex.reindex")
        if level is not None:
            raise NotImplementedError("argument level is not implemented for "
                                      "CategoricalIndex.reindex")
        if limit is not None:
            raise NotImplementedError("argument limit is not implemented for "
                                      "CategoricalIndex.reindex")

        target = ibase._ensure_index(target)

        if not is_categorical_dtype(target) and not target.is_unique:
            raise ValueError("cannot reindex with a non-unique indexer")

        indexer, missing = self.get_indexer_non_unique(np.array(target))
        new_target = self.take(indexer)

        # filling in missing if needed
        if len(missing):
            cats = self.categories.get_indexer(target)

            if (cats == -1).any():
                # coerce to a regular index here!
                result = Index(np.array(self), name=self.name)
                new_target, indexer, _ = result._reindex_non_unique(
                    np.array(target))

            else:

                codes = new_target.codes.copy()
                codes[indexer == -1] = cats[missing]
                new_target = self._create_from_codes(codes)

        # we always want to return an Index type here
        # to be consistent with .reindex for other index types (e.g. they don't
        # coerce based on the actual values, only on the dtype)
        # unless we had an initial Categorical to begin with
        # in which case we are going to conform to the passed Categorical
        new_target = np.asarray(new_target)
        if is_categorical_dtype(target):
            new_target = target._shallow_copy(new_target, name=self.name)
        else:
            new_target = Index(new_target, name=self.name)

        return new_target, indexer
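A small usage sketch of the coercion described in the closing comment: a target value outside the categories forces a plain Index result.

import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c'])
new_target, indexer = ci.reindex(['a', 'c', 'd'])
# 'd' is not a category, so new_target is a regular Index
# and indexer is array([ 0,  2, -1])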
Example #5
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        method = missing.clean_reindex_fill_method(method)
        target = ibase._ensure_index(target)

        if self.equals(target):
            return np.arange(len(self), dtype='intp')

        if method == 'pad' or method == 'backfill':
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == 'nearest':
            raise NotImplementedError("method='nearest' not implemented yet "
                                      'for CategoricalIndex')

        if (isinstance(target, CategoricalIndex) and
                self.values.is_dtype_equal(target)):
            # we have the same codes
            codes = target.codes
        else:
            if isinstance(target, CategoricalIndex):
                target = target.categories
            codes = self.categories.get_indexer(target)

        indexer, _ = self._engine.get_indexer_non_unique(codes)

        return _ensure_platform_int(indexer)
Example #6
    def set_index(self, column=None, idx=None, level=None, inplace=False):
        """Set index from array, column or existing multi-index level.

        Parameters
        ----------
        column: str
            Set index from an existing column in the data.
        idx: pd.Index, np.array
            Set the index directly with a pandas index object or array.
        level: int
            Set index from a multi-index level; useful for groupbys.
        inplace: bool
            Perform the transformation in place.

        Returns
        -------
        sf: sp.SparseFrame | None
            The transformed sparse frame, or None if inplace was True.
        """
        if column is None and idx is None and level is None:
            raise ValueError("One of column, idx or level must be given")
        elif idx is not None:
            assert len(idx) == self.data.shape[0]
            new_idx = idx
        elif level is not None and \
                isinstance(self._index, pd.MultiIndex):
            new_idx = self.index.get_level_values(level)
        elif column is not None:
            new_idx = np.asarray(self.loc[:,
                                          column].data.todense()).reshape(-1)

        if inplace:
            self._index = _ensure_index(new_idx)
        else:
            return SparseFrame(self.data, index=new_idx, columns=self.columns)
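A usage sketch, assuming SparseFrame is the frame class these snippets belong to (the import path is an assumption):

import numpy as np
# from sparsity import SparseFrame  # assumed import path

sf = SparseFrame(np.eye(3), columns=['a', 'b', 'c'])
sf2 = sf.set_index(column='a')         # index taken from column 'a'
sf3 = sf.set_index(idx=np.arange(3))   # index set directly from an array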
Example #7
 def _as_like_interval_index(self, other, error_msg):
     self._assert_can_do_setop(other)
     other = _ensure_index(other)
     if (not isinstance(other, IntervalIndex)
             or self.closed != other.closed):
         raise ValueError(error_msg)
     return other
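A sketch of the inputs this helper rejects; the two indexes below are closed on different sides:

import pandas as pd

a = pd.IntervalIndex.from_breaks([0, 1, 2], closed='right')
b = pd.IntervalIndex.from_breaks([1, 2, 3], closed='left')
# With the helper above, a set operation between `a` and `b` raises
# ValueError because self.closed != other.closed.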
Example #8
    def get_indexer_non_unique(self, target):
        """ this is the same for a CategoricalIndex for get_indexer; the API
        returns the missing values as well
        """
        target = ibase._ensure_index(target)

        if isinstance(target, CategoricalIndex):
            target = target.categories

        codes = self.categories.get_indexer(target)
        return self._engine.get_indexer_non_unique(codes)
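Unlike get_indexer, this variant tolerates duplicate labels and also reports which targets were missing; a minimal sketch:

import pandas as pd

ci = pd.CategoricalIndex(['a', 'a', 'b'])
indexer, missing = ci.get_indexer_non_unique(['a', 'z'])
# indexer holds a position for every match ('a' matches twice, 'z' gives -1);
# missing holds the positions in the target that were not found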
Example #9
    def __init__(self, data, index=None, columns=None, **kwargs):
        if len(data.shape) > 2:
            raise ValueError("Only two dimensional data supported")

        if len(data.shape) == 1 and isinstance(data, pd.Series):
            data = data.to_frame()

        elif len(data.shape) == 1:
            data = data.reshape(-1, 1)

        self.empty = False
        N, K = data.shape

        if index is None:
            self._index = _default_index(N)
        else:
            # assert len(index) == N
            self._index = _ensure_index(index)

        if columns is None:
            self._columns = _default_index(K)
        else:
            # assert len(columns) == K
            self._columns = _ensure_index(columns)

        if not sparse.isspmatrix_csr(data):
            try:
                self._init_values(data, kwargs)
            except TypeError:
                raise TypeError(traceback.format_exc() +
                                "\nThe error described above occurred while "
                                "converting data to sparse matrix.")
        else:
            self._init_csr(data)

        # register indexers
        self.ndim = 2
        self.iloc = _CsrILocationIndexer(self, 'iloc')
        self.loc = _CsrLocIndexer(self, 'loc')
Example #10
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        target = _ensure_index(target)

        if hasattr(target, 'freq') and target.freq != self.freq:
            msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, target.freqstr)
            raise IncompatibleFrequency(msg)

        if isinstance(target, PeriodIndex):
            target = target.asi8

        if tolerance is not None:
            tolerance = self._convert_tolerance(tolerance)
        return Index.get_indexer(self._int64index, target, method, limit,
                                 tolerance)
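A sketch against the public API; per the guard above, a target with a different frequency raises IncompatibleFrequency:

import pandas as pd

pi = pd.period_range('2000-01', periods=3, freq='M')
pi.get_indexer(pd.period_range('2000-02', periods=2, freq='M'))  # array([1, 2])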
Example #11
def _get_combined_index(indexes, intersect=False):
    # TODO: handle index names!
    indexes = _get_distinct_indexes(indexes)
    if len(indexes) == 0:
        return Index([])
    if len(indexes) == 1:
        return indexes[0]
    if intersect:
        index = indexes[0]
        for other in indexes[1:]:
            index = index.intersection(other)
        return index
    union = _union_indexes(indexes)
    return _ensure_index(union)
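The helper is private, but its two paths correspond to the public Index set operations; a sketch:

import pandas as pd

a, b = pd.Index([1, 2, 3]), pd.Index([2, 3, 4])
a.union(b)         # union path (intersect=False): Index([1, 2, 3, 4])
a.intersection(b)  # intersection path (intersect=True): Index([2, 3])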
Example #12
    def _simple_new(cls,
                    left,
                    right,
                    closed=None,
                    name=None,
                    copy=False,
                    verify_integrity=True):
        result = IntervalMixin.__new__(cls)

        if closed is None:
            closed = 'right'
        left = _ensure_index(left, copy=copy)
        right = _ensure_index(right, copy=copy)

        # coerce dtypes to match if needed
        if is_float_dtype(left) and is_integer_dtype(right):
            right = right.astype(left.dtype)
        if is_float_dtype(right) and is_integer_dtype(left):
            left = left.astype(right.dtype)

        if type(left) != type(right):
            raise ValueError("must not have differing left [{}] "
                             "and right [{}] types".format(
                                 type(left), type(right)))

        if isinstance(left, ABCPeriodIndex):
            raise ValueError("Period dtypes are not supported, "
                             "use a PeriodIndex instead")

        result._left = left
        result._right = right
        result._closed = closed
        result.name = name
        if verify_integrity:
            result._validate()
        result._reset_identity()
        return result
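The public counterpart of this constructor path; a minimal sketch:

import pandas as pd

idx = pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3], closed='right')
# the left/right endpoint arrays are coerced to matching dtypes, as above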
Example #13
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        """
        Compute indexer and mask for new index given the current index. The
        indexer should be then used as an input to ndarray.take to align the
        current data to the new index. The mask determines whether labels are
        found or not in the current index

        Parameters
        ----------
        target : MultiIndex or Index (of tuples)
        method : {'pad', 'ffill', 'backfill', 'bfill'}
            pad / ffill: propagate LAST valid observation forward to next valid
            backfill / bfill: use NEXT valid observation to fill gap

        Notes
        -----
        This is a low-level method and probably should be used at your own risk

        Examples
        --------
        >>> indexer, mask = index.get_indexer(new_index)
        >>> new_values = cur_values.take(indexer)
        >>> new_values[-mask] = np.nan

        Returns
        -------
        (indexer, mask) : (ndarray, ndarray)
        """
        method = missing.clean_reindex_fill_method(method)
        target = ibase._ensure_index(target)

        if isinstance(target, CategoricalIndex):
            target = target.categories

        if method == 'pad' or method == 'backfill':
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == 'nearest':
            raise NotImplementedError("method='nearest' not implemented yet "
                                      'for CategoricalIndex')
        else:

            codes = self.categories.get_indexer(target)
            indexer, _ = self._engine.get_indexer_non_unique(codes)

        return _ensure_platform_int(indexer)
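A sketch of the -1 sentinel described in the docstring:

import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c'])
ci.get_indexer(['b', 'z'])  # array([ 1, -1]); -1 marks a missing label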
Example #14
    def set_index(self, column=None, idx=None, level=None, inplace=False):
        """Set index from array, column or existing multi-index level."""
        if column is None and idx is None and level is None:
            raise ValueError("One of column, idx or level must be given")
        elif idx is not None:
            assert len(idx) == self.data.shape[0]
            new_idx = idx
        elif level is not None and \
                isinstance(self._index, pd.MultiIndex):
            new_idx = self.index.get_level_values(level)
        elif column is not None:
            new_idx = np.asarray(self[column].data.todense()).reshape(-1)

        if inplace:
            self._index = _ensure_index(new_idx)
        else:
            return SparseFrame(self.data, index=new_idx, columns=self.columns)
Example #15
    def __init__(self, data, index=None, columns=None, **kwargs):
        """Init SparseFrame

        Parameters
        ----------
        data: sparse.csr_matrix | np.ndarray | pandas.DataFrame
            Data to initialize matrix with. Can be one of above types, or
            anything accepted by sparse.csr_matrix along with the correct
            kwargs.
        index: pd.Index or array-like
            Index to use for the resulting frame. Defaults to a RangeIndex
            if the input data carries no indexing information and no index
            is provided.
        columns: pd.Index or array-like
            Column labels to use for the resulting frame; defaults as for
            index.
        """
        if len(data.shape) > 2:
            raise ValueError("Only two dimensional data supported")

        if len(data.shape) == 1 and isinstance(data, pd.Series):
            data = data.to_frame()

        elif len(data.shape) == 1:
            data = data.reshape(-1, 1)

        self.empty = False
        N, K = data.shape

        if index is None:
            self._index = _default_index(N)
        elif len(index) != N and data.size:
            if columns is not None:
                implied_axis_1 = len(columns)
            else:
                implied_axis_1 = data.shape[1]
            raise ValueError('Shape of passed values is {}, '
                             'indices imply {}'.format(
                                 data.shape, (len(index), implied_axis_1)))
        else:
            self._index = _ensure_index(index)

        if columns is None:
            self._columns = _default_index(K)
        elif len(columns) != K and data.size:
            if index is not None:
                implied_axis_0 = len(index)
            else:
                implied_axis_0 = data.shape[0]
            raise ValueError('Shape of passed values is {}, '
                             'indices imply {}'.format(
                                 data.shape, (implied_axis_0, len(columns))))
        else:
            self._columns = _ensure_index(columns)

        if not sparse.isspmatrix_csr(data):
            try:
                self._init_values(data,
                                  init_index=index is None,
                                  init_columns=columns is None,
                                  **kwargs)
            except TypeError:
                raise TypeError(traceback.format_exc() +
                                "\nThe error described above occurred while "
                                "converting data to sparse matrix.")
        else:
            self.empty = _is_empty(data)
            self._init_csr(data)

        self.ndim = 2
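A construction sketch covering the main input types the docstring lists (the SparseFrame import path is an assumption):

import numpy as np
import pandas as pd
from scipy import sparse
# from sparsity import SparseFrame  # assumed import path

SparseFrame(sparse.csr_matrix(np.eye(3)))  # csr matrix: used as-is
SparseFrame(np.eye(3), index=list('abc'))  # ndarray: converted via _init_values
SparseFrame(pd.DataFrame(np.eye(3)))       # DataFrame: its index/columns reused
SparseFrame(np.arange(3.0))                # 1d input: reshaped to (3, 1)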
Example #16
def get_indexer_non_unique(self, target):
    target = self._maybe_cast_indexed(_ensure_index(target))
    return super(IntervalIndex, self).get_indexer_non_unique(target)
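A hedged sketch of the IntervalIndex variant; exact return values depend on the pandas version:

import pandas as pd

idx = pd.IntervalIndex.from_breaks([0, 1, 2])
indexer, missing = idx.get_indexer_non_unique([0.5, 5.0])
# 0.5 falls in (0, 1]; 5.0 matches no interval and is reported in `missing`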