Example #1
    def get_indexer_non_unique(self, target):
        target = ibase._ensure_index(target)

        if isinstance(target, CategoricalIndex):
            # Indexing on codes is more efficient if categories are the same:
            if target.categories is self.categories:
                target = target.codes
                indexer, missing = self._engine.get_indexer_non_unique(target)
                return _ensure_platform_int(indexer), missing
            target = target.values

        codes = self.categories.get_indexer(target)
        indexer, missing = self._engine.get_indexer_non_unique(codes)
        return _ensure_platform_int(indexer), missing
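
Every example on this page routes its integer indexer through _ensure_platform_int before handing it to a take/indexing engine. A minimal sketch of what that normalization amounts to, assuming it boils down to casting to NumPy's platform integer (np.intp):

import numpy as np

def ensure_platform_int_sketch(values):
    # Cast an integer indexer to the platform int (np.intp);
    # arrays that already match pass through without a copy.
    arr = np.asarray(values)
    return arr if arr.dtype == np.intp else arr.astype(np.intp)

print(ensure_platform_int_sketch([0, 2, -1]).dtype)  # intp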
Example #2
    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = _ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError('Index contains duplicate entries, '
                             'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
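
The selector arithmetic above assigns each row a flat cell number, group * stride + level code, in the unstacked result; if two rows land on the same cell, the index has duplicates and the reshape is refused. A self-contained sketch of that collision check (variable names mirror the method, data is made up):

import numpy as np

comp_index = np.array([0, 0, 1, 1])   # compressed ids of the remaining levels
last_labels = np.array([0, 1, 0, 1])  # codes of the level being unstacked
ngroups = 2
stride = 2                            # distinct values in the unstacked level

selector = last_labels + stride * comp_index
mask = np.zeros(ngroups * stride, dtype=bool)
mask.put(selector, True)
assert mask.sum() == len(selector)    # no collisions, so the index is unique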
Example #3
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        nv.validate_take(tuple(), kwargs)
        indices = _ensure_platform_int(indices)
        left, right = self.left, self.right

        if fill_value is None:
            fill_value = self._na_value
        mask = indices == -1

        if not mask.any():
            # we won't change dtype here in this case
            # if we don't need
            allow_fill = False

        taker = lambda x: x.take(indices, allow_fill=allow_fill,
                                 fill_value=fill_value)

        try:
            new_left = taker(left)
            new_right = taker(right)
        except ValueError:
            # we need to coerce; might have NA's in an
            # integer dtype
            new_left = taker(left.astype(float))
            new_right = taker(right.astype(float))

        return self._shallow_copy(new_left, new_right)
Example #4
def get_group_index_sorter(group_index, ngroups):
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
                                   (count * np.log(count))))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(_ensure_int64(group_index),
                                            ngroups)
        return _ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind='mergesort')
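
For reference, a pure-NumPy stable counting sort that produces the same kind of indexer algos.groupsort_indexer returns (the Cython helper's exact signature is an assumption here; this sketch returns only the sorter):

import numpy as np

def counting_sort_indexer(group_index, ngroups):
    # O(count + ngroups): bucket offsets, then one stable left-to-right pass.
    counts = np.bincount(group_index, minlength=ngroups)
    starts = np.zeros(ngroups, dtype=np.intp)
    starts[1:] = np.cumsum(counts)[:-1]
    sorter = np.empty(len(group_index), dtype=np.intp)
    pos = starts.copy()
    for i, g in enumerate(group_index):
        sorter[pos[g]] = i
        pos[g] += 1
    return sorter

gi = np.array([1, 0, 1, 0])
print(counting_sort_indexer(gi, 2))  # [1 3 0 2]; ties keep original order

Stability is the point: gi.take(sorter) groups equal keys without reordering them within a group, which transform('first') depends on.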
Example #5
    def get_indexer(self, target, method=None, limit=None, tolerance=None):

        self._check_method(method)
        target = _ensure_index(target)
        target = self._maybe_cast_indexed(target)

        if self.equals(target):
            return np.arange(len(self), dtype='intp')

        if self.is_non_overlapping_monotonic:
            start, stop = self._find_non_overlapping_monotonic_bounds(target)

            start_plus_one = start + 1
            if not ((start_plus_one < stop).any()):
                return np.where(start_plus_one == stop, start, -1)

        if not self.is_unique:
            raise ValueError("cannot handle non-unique indices")

        # IntervalIndex
        if isinstance(target, IntervalIndex):
            indexer = self._get_reindexer(target)

        # non IntervalIndex
        else:
            indexer = np.concatenate([self.get_loc(i) for i in target])

        return _ensure_platform_int(indexer)
Example #6
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        method = missing.clean_reindex_fill_method(method)
        target = ibase._ensure_index(target)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype='intp')

        if method == 'pad' or method == 'backfill':
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == 'nearest':
            raise NotImplementedError("method='nearest' not implemented yet "
                                      'for CategoricalIndex')

        if (isinstance(target, CategoricalIndex)
                and self.values.is_dtype_equal(target)):
            # we have the same codes
            codes = target.codes
        else:
            if isinstance(target, CategoricalIndex):
                code_indexer = self.categories.get_indexer(target.categories)
                codes = take_1d(code_indexer, target.codes, fill_value=-1)
            else:
                codes = self.categories.get_indexer(target)

        indexer, _ = self._engine.get_indexer_non_unique(codes)
        return _ensure_platform_int(indexer)
Example #7
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        from pandas.core.arrays.categorical import _recode_for_categories

        method = missing.clean_reindex_fill_method(method)
        target = ibase._ensure_index(target)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype='intp')

        if method == 'pad' or method == 'backfill':
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == 'nearest':
            raise NotImplementedError("method='nearest' not implemented yet "
                                      'for CategoricalIndex')

        if (isinstance(target, CategoricalIndex) and
                self.values.is_dtype_equal(target)):
            if self.values.equals(target.values):
                # we have the same codes
                codes = target.codes
            else:
                codes = _recode_for_categories(target.codes,
                                               target.categories,
                                               self.values.categories)
        else:
            if isinstance(target, CategoricalIndex):
                code_indexer = self.categories.get_indexer(target.categories)
                codes = take_1d(code_indexer, target.codes, fill_value=-1)
            else:
                codes = self.categories.get_indexer(target)

        indexer, _ = self._engine.get_indexer_non_unique(codes)
        return _ensure_platform_int(indexer)
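
When both sides are categorical with equal dtypes but differently ordered categories, the code above translates the target's codes into self's coding through an indexer between the two category arrays. A toy version of that translation in plain NumPy (take_1d is approximated by ndarray.take; the data is illustrative):

import numpy as np

self_categories = np.array(['a', 'b', 'c'])
target_categories = np.array(['b', 'c', 'd'])
target_codes = np.array([0, 2, 1], dtype=np.intp)  # decodes to 'b', 'd', 'c'

# position of each target category within self's categories (-1 if absent)
code_indexer = np.array([1, 2, -1], dtype=np.intp)
recoded = code_indexer.take(target_codes)
print(recoded)  # [ 1 -1  2] -> 'b', <missing>, 'c' in self's coding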
Example #8
    def take(self, indexer, allow_fill=True, fill_value=None):
        mask = indexer == -1

        indexer = _ensure_platform_int(indexer)
        out = self.values.take(indexer)
        out[mask] = self._na_value

        return type(self)(out)
Example #9
    def get_indexer_non_unique(self, target):
        target = ibase._ensure_index(target)

        if isinstance(target, CategoricalIndex):
            target = target.categories

        codes = self.categories.get_indexer(target)
        indexer, missing = self._engine.get_indexer_non_unique(codes)
        return _ensure_platform_int(indexer), missing
Example #10
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        nv.validate_take(tuple(), kwargs)
        indices = _ensure_platform_int(indices)
        taken = self._assert_take_fillable(self.codes, indices,
                                           allow_fill=allow_fill,
                                           fill_value=fill_value,
                                           na_value=-1)
        return self._create_from_codes(taken)
Example #11
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        """
        Sparse-compatible version of ndarray.take

        Returns
        -------
        taken : ndarray
        """
        nv.validate_take(tuple(), kwargs)

        if axis:
            raise ValueError(
                "axis must be 0, input was {axis}".format(axis=axis))

        if is_integer(indices):
            # return scalar
            return self[indices]

        indices = _ensure_platform_int(indices)
        n = len(self)
        if allow_fill and fill_value is not None:
            # allow -1 to indicate self.fill_value,
            # self.fill_value may not be NaN
            if (indices < -1).any():
                msg = ('When allow_fill=True and fill_value is not None, '
                       'all indices must be >= -1')
                raise ValueError(msg)
            elif (n <= indices).any():
                msg = 'index is out of bounds for size {size}'.format(size=n)
                raise IndexError(msg)
        else:
            if ((indices < -n) | (n <= indices)).any():
                msg = 'index is out of bounds for size {size}'.format(size=n)
                raise IndexError(msg)

        indices = indices.astype(np.int32)
        if not (allow_fill and fill_value is not None):
            indices = indices.copy()
            indices[indices < 0] += n

        locs = self.sp_index.lookup_array(indices)
        indexer = np.arange(len(locs), dtype=np.int32)
        mask = locs != -1
        if mask.any():
            indexer = indexer[mask]
            new_values = self.sp_values.take(locs[mask])
        else:
            indexer = np.empty(shape=(0, ), dtype=np.int32)
            new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

        sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
        return self._simple_new(new_values, sp_index, self.fill_value)
Example #12
    def test_transform_fast(self):

        df = DataFrame({
            'id': np.arange(100000) / 3,
            'val': np.random.randn(100000)
        })

        grp = df.groupby('id')['val']

        values = np.repeat(grp.mean().values,
                           _ensure_platform_int(grp.count().values))
        expected = pd.Series(values, index=df.index, name='val')

        result = grp.transform(np.mean)
        assert_series_equal(result, expected)

        result = grp.transform('mean')
        assert_series_equal(result, expected)

        # GH 12737
        df = pd.DataFrame(
            {
                'grouping': [0, 1, 1, 3],
                'f': [1.1, 2.1, 3.1, 4.5],
                'd': pd.date_range('2014-1-1', '2014-1-4'),
                'i': [1, 2, 3, 4]
            },
            columns=['grouping', 'f', 'i', 'd'])
        result = df.groupby('grouping').transform('first')

        dates = [
            pd.Timestamp('2014-1-1'),
            pd.Timestamp('2014-1-2'),
            pd.Timestamp('2014-1-2'),
            pd.Timestamp('2014-1-4')
        ]
        expected = pd.DataFrame(
            {
                'f': [1.1, 2.1, 2.1, 4.5],
                'd': dates,
                'i': [1, 2, 2, 4]
            },
            columns=['f', 'i', 'd'])
        assert_frame_equal(result, expected)

        # selection
        result = df.groupby('grouping')[['f', 'i']].transform('first')
        expected = expected[['f', 'i']]
        assert_frame_equal(result, expected)

        # dup columns
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
        result = df.groupby('g').transform('first')
        expected = df.drop('g', axis=1)
        assert_frame_equal(result, expected)
Example #13
    def get_group_levels(self):
        if not self.compressed and len(self.groupings) == 1:
            return [self.groupings[0].result_index]

        name_list = []
        for ping, labels in zip(self.groupings, self.recons_labels):
            labels = _ensure_platform_int(labels)
            levels = ping.result_index.take(labels)

            name_list.append(levels)

        return name_list
Example #14
    def take(self, indexer, allow_fill=True, fill_value=None):
        indexer = np.asarray(indexer)
        mask = indexer == -1

        # take on empty array not handled as desired by numpy in case of -1
        if not len(self) and mask.all():
            return type(self)([self._na_value] * len(indexer))

        indexer = _ensure_platform_int(indexer)
        out = self.values.take(indexer)
        out[mask] = self._na_value

        return type(self)(out)
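
Several of the take() implementations above lean on the -1 sentinel convention: fill positions are marked with -1 in the indexer, taken anyway (a -1 wraps around to the last element), then overwritten with the NA value. A standalone sketch using np.nan as the NA value:

import numpy as np

values = np.array([10.0, 20.0, 30.0])
indexer = np.array([2, -1, 0], dtype=np.intp)

mask = indexer == -1
out = values.take(indexer)  # the -1 slot temporarily holds values[-1]
out[mask] = np.nan          # overwrite masked slots with the NA value
print(out)                  # [30. nan 10.]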
Example #15
    def group_info(self):
        ngroups = self.ngroups
        obs_group_ids = np.arange(ngroups)
        rep = np.diff(np.r_[0, self.bins])

        rep = _ensure_platform_int(rep)
        if ngroups == len(self.bins):
            comp_ids = np.repeat(np.arange(ngroups), rep)
        else:
            comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)

        return comp_ids.astype('int64', copy=False), \
            obs_group_ids.astype('int64', copy=False), ngroups
Example #16
    def size(self):
        """
        Compute group sizes

        """
        ids, _, ngroup = self.group_info
        ids = _ensure_platform_int(ids)
        if ngroup:
            out = np.bincount(ids[ids != -1], minlength=ngroup)
        else:
            out = ids
        return Series(out,
                      index=self.result_index,
                      dtype='int64')
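
The size computation reduces to a bincount over the compressed group ids, after dropping the -1 entries that mark rows whose group key is missing; for example:

import numpy as np

ids = np.array([0, 1, 0, -1, 1, 1], dtype=np.intp)  # -1 marks NA group keys
ngroup = 2
print(np.bincount(ids[ids != -1], minlength=ngroup))  # [2 3]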
Example #17
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None, takeable=False):
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return self._constructor(index=index,
                                     columns=self.columns).__finalize__(self)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = _ensure_platform_int(indexer)
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in self.iteritems():
            if mask.all():
                continue

            values = series.values
            # .take returns SparseArray
            new = values.take(indexer)
            if need_mask:
                new = new.values
                # convert integer to float if necessary. need to do a lot
                # more than that, handle boolean etc also
                new, fill_value = maybe_upcast(new, fill_value=fill_value)
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return self._constructor(
            new_series,
            index=index,
            columns=self.columns,
            default_fill_value=self._default_fill_value).__finalize__(self)
Example #18
    def _make_sorted_values_labels(self):
        v = self.level

        labs = list(self.index.labels)
        levs = list(self.index.levels)
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Example #19
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError("Only list-like objects are allowed to be passed to"
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):

        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if PY3 and lib.infer_dtype(values) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be"
                        "passed to safe_sort as labels")
    labels = _ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = _ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, _ensure_platform_int(new_labels)
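
The heart of safe_sort is the reverse indexer: scattering arange(len(sorter)) through the sorter gives, for each original position, its rank in the sorted output, so the old labels can be re-pointed with a single take. A small sketch:

import numpy as np

values = np.array(['b', 'c', 'a'])
labels = np.array([0, 2, 1, 2], dtype=np.intp)  # decodes to 'b', 'a', 'c', 'a'

sorter = values.argsort()                        # [2, 0, 1]
reverse_indexer = np.empty(len(sorter), dtype=np.intp)
reverse_indexer.put(sorter, np.arange(len(sorter)))

ordered = values.take(sorter)                    # ['a', 'b', 'c']
new_labels = reverse_indexer.take(labels)        # [1, 0, 2, 0]
print(ordered[new_labels])                       # still 'b', 'a', 'c', 'a'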
Example #20
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units corresponding
    to the original frequency (366 for daily frequency, 366*24 for hourly...).
    The first column of the output corresponds to Jan. 1st, 00:00:00,
    while the last column corresponds to Dec. 31st, 23:59:59.
    Entries corresponding to Feb. 29th are masked for non-leap years.

    For example, if the initial series has a daily frequency, the 59th column
    of the output always corresponds to Feb. 28th, the 61st column to Mar. 1st,
    and the 60th column is masked for non-leap years.
    With an hourly initial frequency, the (59*24)th column of the output always
    corresponds to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00, and
    the 24 columns between (59*24) and (61*24) are masked.

    If the original frequency is less than daily, the output is equivalent to
    ``series.convert('A', func=None)``.

    Parameters
    ----------
    series : Series
    freq : string or None, default None

    Returns
    -------
    annual : DataFrame
    """

    msg = "pivot_annual is deprecated. Use pivot_table instead"
    warnings.warn(msg, FutureWarning)

    index = series.index
    year = index.year
    years = algorithms.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        offset = np.asarray(index.dayofyear) - 1

        # adjust for leap year
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        offset = np.asarray(index.month) - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
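
As the deprecation message says, the same annual layout can be built with the public pivot_table API. A minimal sketch for a daily series, with day-of-year column labels and without the Feb-29 shifting that pivot_annual performs:

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(730),
              index=pd.date_range('2015-01-01', periods=730, freq='D'))

df = pd.DataFrame({'value': s.values,
                   'year': s.index.year,
                   'doy': s.index.dayofyear})
annual = df.pivot_table(index='year', columns='doy', values='value')
print(annual.shape)  # one row per year, one column per day of year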