Example #1
    def _convert_arr_indexer(self, keyarr):
        # Cast the indexer to uint64 if possible so
        # that the values returned from indexing are
        # also uint64.
        keyarr = _asarray_tuplesafe(keyarr)
        if is_integer_dtype(keyarr):
            return _asarray_tuplesafe(keyarr, dtype=np.uint64)
        return keyarr
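Every example on this page funnels list-like keys through _asarray_tuplesafe, whose whole point is to build a 1-D object array even when the elements are tuples; plain np.asarray would stack the tuples into a 2-D array and break tuple-keyed lookups. A minimal sketch of the idea using only public NumPy calls (the helper itself is a pandas internal):

import numpy as np

keys = [(1, 'a'), (2, 'b')]
np.asarray(keys).shape     # (2, 2) -- the tuples get flattened into rows

# roughly what _asarray_tuplesafe does for tuple elements
arr = np.empty(len(keys), dtype=object)
arr[:] = keys              # 1-D object array whose elements are the tuples
arr.shape                  # (2,)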
Example #2
    def drop(self, labels):
        """
        Make new MultiIndex with passed list of labels deleted

        Parameters
        ----------
        labels : array-like
            Must be a list of tuples

        Returns
        -------
        dropped : MultiIndex
        """
        try:
            if not isinstance(labels, np.ndarray):
                labels = _asarray_tuplesafe(labels)
            indexer, mask = self.get_indexer(labels)
            if not mask.all():
                raise ValueError('labels %s not contained in axis'
                                 % labels[~mask])
            return self.delete(indexer)
        except Exception:
            pass

        inds = []
        for label in labels:
            loc = self.get_loc(label)
            if isinstance(loc, int):
                inds.append(loc)
            else:
                inds.extend(range(loc.start, loc.stop))

        return self.delete(inds)
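MultiIndex.drop is public API, so the tuple-safe path above is easy to exercise directly. A small usage sketch (repr details vary by pandas version; missing labels raise ValueError in this old code, KeyError in recent pandas):

import pandas as pd

mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
mi.drop([('a', 2)])    # MultiIndex([('a', 1), ('b', 1)])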
Example #3
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Examples
    --------

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, string_types):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    result = _hashtable_algo(f, values.dtype)

    if na_sentinel != -1:

        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas.core.series import Series
        result = Series(result.ravel()).replace(
            -1, na_sentinel).values.reshape(result.shape)

    return result
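match was a top-level helper in early pandas and has since been removed; the closest public equivalent today is Index.get_indexer, which uses the same -1 "not found" sentinel. A hedged sketch:

import numpy as np
import pandas as pd

values = pd.Index(['b', 'a', 'c'])      # the unique set of values
to_match = np.array(['a', 'x', 'c'])    # values to find positions of
values.get_indexer(to_match)            # array([ 1, -1,  2])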
Example #4
    def convert(values, unit, axis):
        def try_parse(values):
            try:
                return _dt_to_float_ordinal(tools.to_datetime(values))
            except Exception:
                return values

        if isinstance(values, (datetime, pydt.date)):
            return _dt_to_float_ordinal(values)
        elif isinstance(values, pydt.time):
            return dates.date2num(values)
        elif (com.is_integer(values) or com.is_float(values)):
            return values
        elif isinstance(values, compat.string_types):
            return try_parse(values)
        elif isinstance(values, (list, tuple, np.ndarray)):
            if not isinstance(values, np.ndarray):
                values = com._asarray_tuplesafe(values)

            if com.is_integer_dtype(values) or com.is_float_dtype(values):
                return values

            try:
                values = tools.to_datetime(values)
                if isinstance(values, Index):
                    values = values.map(_dt_to_float_ordinal)
                else:
                    values = [_dt_to_float_ordinal(x) for x in values]
            except Exception:
                pass

        return values
Example #5
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort :
    order :

    Returns
    -------
    """
    hash_klass, values = _get_hash_table_and_cast(values)

    uniques = []
    table = hash_klass(len(values))
    labels, counts = table.get_labels(values, uniques, 0, na_sentinel)

    uniques = com._asarray_tuplesafe(uniques)
    if sort and len(counts) > 0:
        sorter = uniques.argsort()
        reverse_indexer = np.empty(len(sorter), dtype=np.int32)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)
        counts = counts.take(sorter)

    return labels, uniques, counts
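For comparison, the public pd.factorize in current pandas returns only a (codes, uniques) pair; the counts in the old signature above are gone. A short sketch:

import pandas as pd

codes, uniques = pd.factorize(['b', 'a', 'b', 'c'], sort=True)
# codes   -> array([1, 0, 1, 2])
# uniques -> array(['a', 'b', 'c'], dtype=object)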
Example #6
    def _convert_arr_indexer(self, keyarr):
        keyarr = com._asarray_tuplesafe(keyarr)

        if self.categories._defer_to_indexing:
            return keyarr

        return self._shallow_copy(keyarr)
Example #7
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Examples
    --------

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, compat.string_types):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    return _hashtable_algo(f, values.dtype)
Example #8
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)

        def _reindex(keys, level=None):
            try:
                return self.obj.reindex_axis(keys, axis=axis, level=level)
            except AttributeError:
                # Series
                assert(axis == 0)
                return self.obj.reindex(keys, level=level)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            return _reindex(labels[np.asarray(key)])
        else:
            if isinstance(key, Index):
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)
            if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
                keyarr = labels.take(keyarr)

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
                level = 0
            else:
                level = None

            return _reindex(keyarr, level=level)
Example #9
    def to_tuples(self, na_tuple=True):
        """
        Return an Index of tuples of the form (left, right)

        Parameters
        ----------
        na_tuple : boolean, default True
            Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA
            value itself if False, ``nan``.

            .. versionadded:: 0.23.0

        Examples
        --------
        >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3])
        >>> idx.to_tuples()
        Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object')
        >>> idx.to_tuples(na_tuple=False)
        Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')
        """
        tuples = _asarray_tuplesafe(zip(self.left, self.right))
        if not na_tuple:
            # GH 18756
            tuples = np.where(~self._isnan, tuples, np.nan)
        return Index(tuples)
Example #10
    def _convert_to_indexer(self, obj, axis=0):
        """
        Convert indexing key into something we can use to do actual fancy
        indexing on an ndarray

        Examples
        ix[:5] -> slice(0, 5)
        ix[[1,2,3]] -> [1,2,3]
        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

        Going by Zen of Python?
        "In the face of ambiguity, refuse the temptation to guess."
        raise AmbiguousIndexError with integer labels?
        - No, prefer label-based indexing
        """
        index = self.obj._get_axis(axis)

        try:
            return index.get_loc(obj)
        except (KeyError, TypeError):
            pass

        is_int_index = _is_integer_index(index)
        if isinstance(obj, slice):
            if _is_label_slice(index, obj):
                i, j = index.slice_locs(obj.start, obj.stop)

                if obj.step is not None:
                    raise Exception('Non-zero step not supported with '
                                    'label-based slicing')
                return slice(i, j)
            else:
                return obj
        elif _is_list_like(obj):
            objarr = _asarray_tuplesafe(obj)

            if objarr.dtype == np.bool_:
                if not obj.index.equals(index):
                    raise IndexingError('Cannot use boolean index with '
                                        'misaligned or unequal labels')
                return objarr
            else:
                # If have integer labels, defer to label-based indexing
                if _is_integer_dtype(objarr) and not is_int_index:
                    return objarr

                indexer = index.get_indexer(objarr)
                mask = indexer == -1
                if mask.any():
                    raise KeyError('%s not in index' % objarr[mask])

                return indexer
        else:
            if com.is_integer(obj) and not is_int_index:
                return obj
            return index.get_loc(obj)
Example #11
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
    group_axis = obj._get_axis(axis)

    if level is not None and not isinstance(group_axis, MultiIndex):
        raise ValueError('can only specify level with multi-level index')

    if not isinstance(key, (tuple, list)):
        keys = [key]
    else:
        keys = key

    # what are we after, exactly?
    match_axis_length = len(keys) == len(group_axis)
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_arraylike = any(isinstance(g, (list, tuple, np.ndarray))
                        for g in keys)

    try:
        if isinstance(obj, DataFrame):
            all_in_columns = all(g in obj.columns for g in keys)
        else:
            all_in_columns = False
    except Exception:
        all_in_columns = False

    if (not any_callable and not all_in_columns
        and not any_arraylike and match_axis_length
        and not level):
        keys = [com._asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []
    for i, (gpr, level) in enumerate(zip(keys, levels)):
        name = None
        if _is_label_like(gpr):
            exclusions.append(gpr)
            name = gpr
            gpr = obj[gpr]
        ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
        if ping.name is None:
            ping.name = 'key_%d' % i
        groupings.append(ping)

    grouper = Grouper(group_axis, groupings, sort=sort)

    return grouper, exclusions
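The _asarray_tuplesafe call above fires when the key list matches the axis length and names no columns: the whole list is then treated as a single grouping array, one tuple per row. Roughly the same effect through public API (a sketch; printed output abbreviated):

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3, 4]})
keys = [('a', 1), ('a', 1), ('b', 2), ('b', 2)]

grouper = np.empty(len(keys), dtype=object)   # 1-D object array of tuples,
grouper[:] = keys                             # as _asarray_tuplesafe builds
df.groupby(grouper)['x'].sum()
# ('a', 1)    3
# ('b', 2)    7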
Example #12
    def get_kwargs_from_breaks(self, breaks, closed='right'):
        """
        converts intervals in breaks format to a dictionary of kwargs to
        specific to the format expected by IntervalIndex.from_tuples
        """
        if len(breaks) == 0:
            return {'data': breaks}

        tuples = lzip(breaks[:-1], breaks[1:])
        if isinstance(breaks, (list, tuple)):
            return {'data': tuples}
        return {'data': com._asarray_tuplesafe(tuples)}
Example #13
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)

        def _reindex(keys, level=None):
            try:
                return self.obj.reindex_axis(keys, axis=axis, level=level)
            except AttributeError:
                # Series
                if axis != 0:
                    raise AssertionError('axis must be 0')
                return self.obj.reindex(keys, level=level)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            inds, = np.asarray(key, dtype=bool).nonzero()
            return self.obj.take(inds, axis=axis)
        else:
            was_index = isinstance(key, Index)
            if was_index:
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)

            if _is_integer_dtype(keyarr):
                if labels.inferred_type != 'integer':
                    keyarr = np.where(keyarr < 0,
                                      len(labels) + keyarr, keyarr)

                if labels.inferred_type == 'mixed-integer':
                    indexer = labels.get_indexer(keyarr)
                    if (indexer >= 0).all():
                        return self.obj.take(indexer, axis=axis)
                    else:
                        return self.obj.take(keyarr, axis=axis)
                elif not labels.inferred_type == 'integer':

                    return self.obj.take(keyarr, axis=axis)

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
                level = 0
            else:
                level = None

            if labels.is_unique:
                return _reindex(keyarr, level=level)
            else:
                mask = labels.isin(keyarr)
                return self.obj.take(mask.nonzero()[0], axis=axis)
Example #14
    def __new__(cls, data, dtype=None, copy=False, name=None):
        if isinstance(data, np.ndarray):
            if dtype is None and issubclass(data.dtype.type, np.integer):
                return Int64Index(data, copy=copy, name=name)
            subarr = np.array(data, dtype=object, copy=copy)
        elif np.isscalar(data):
            raise ValueError("Index(...) must be called with a collection "
                             "of some kind, %s was passed" % repr(data))
        else:
            # other iterable of some kind
            subarr = _asarray_tuplesafe(data, dtype=object)

        subarr = subarr.view(cls)
        subarr.name = name
        return subarr
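This tuple-safe branch is why an Index built from tuples stays one-dimensional. In modern pandas the constructor instead tuple-izes such input into a MultiIndex by default; tupleize_cols=False recovers the flat object Index of tuples:

import pandas as pd

pd.Index([(1, 'a'), (2, 'b')])                       # becomes a MultiIndex
pd.Index([(1, 'a'), (2, 'b')], tupleize_cols=False)  # object Index of tuples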
Example #15
    def _prep_window(self, **kwargs):
        """ provide validation for our window type, return the window """
        window = self._get_window()

        if isinstance(window, (list, tuple, np.ndarray)):
            return com._asarray_tuplesafe(window).astype(float)
        elif com.is_integer(window):
            try:
                import scipy.signal as sig
            except ImportError:
                raise ImportError('Please install scipy to generate window weight')
            win_type = _validate_win_type(self.win_type, kwargs)  # may pop from kwargs
            return sig.get_window(win_type, window).astype(float)

        raise ValueError('Invalid window %s' % str(window))
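The scipy branch resolves a window-type name into an array of weights. A quick look at what get_window returns (values elided):

import scipy.signal as sig

sig.get_window('hamming', 5)            # ndarray of 5 Hamming-window weights
sig.get_window(('gaussian', 1.0), 5)    # parametrized windows take a tuple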
Example #16
def unique(values):
    """
    Compute unique values (not necessarily sorted) efficiently from input array
    of values

    Parameters
    ----------
    values : array-like

    Returns
    -------
    uniques
    """
    values = com._asarray_tuplesafe(values)
    f = lambda htype, caster: _unique_generic(values, htype, caster)
    return _hashtable_algo(f, values.dtype)
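The public pd.unique is the modern counterpart; unlike np.unique it does not sort, preserving order of first appearance:

import pandas as pd

pd.unique(['b', 'a', 'b', 'c'])
# array(['b', 'a', 'c'], dtype=object)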
Example #18
    def _read_panel_table(self, group, where=None):
        from pandas.core.common import _asarray_tuplesafe

        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'],
                               table._v_attrs.index_kind)
        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = LongPanel(sel.values['values'], index=long_index,
                       columns=fields)

        if lp.consistent:
            lp = lp.sortlevel(level=0)
            wp = lp.to_wide()
        else:
            if not self._quiet:  # pragma: no cover
                print('Duplicate entries in table, taking most recently '
                      'appended')

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = LongPanel(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_wide()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Example #19
    def _prep_window(self, **kwargs):
        """ provide validation for our window type, return the window """
        window = self._get_window()

        if isinstance(window, (list, tuple, np.ndarray)):
            return com._asarray_tuplesafe(window).astype(float)
        elif com.is_integer(window):
            try:
                import scipy.signal as sig
            except ImportError:
                raise ImportError('Please install scipy to generate window '
                                  'weight')
            # the below may pop from kwargs
            win_type = _validate_win_type(self.win_type, kwargs)
            return sig.get_window(win_type, window).astype(float)

        raise ValueError('Invalid window %s' % str(window))
Example #20
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)

        def _reindex(keys, level=None):
            try:
                return self.obj.reindex_axis(keys, axis=axis, level=level)
            except AttributeError:
                # Series
                assert(axis == 0)
                return self.obj.reindex(keys, level=level)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            inds, = np.asarray(key, dtype=bool).nonzero()
            return self.obj.take(inds, axis=axis)
        else:
            if isinstance(key, Index):
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)

            if _is_integer_dtype(keyarr):
                if labels.inferred_type == 'mixed-integer':
                    indexer = labels.get_indexer(keyarr)
                    if (indexer >= 0).all():
                        return self.obj.take(indexer, axis=axis)
                    else:
                        return self.obj.take(keyarr, axis=axis)
                elif not labels.inferred_type == 'integer':
                    return self.obj.take(keyarr, axis=axis)

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
                level = 0
            else:
                level = None

            if labels.is_unique:
                return _reindex(keyarr, level=level)
            else:
                mask = labels.isin(keyarr)
                return self.obj.take(mask.nonzero()[0], axis=axis)
Example #21
    def _read_panel_table(self, group, where=None):
        from pandas.core.common import _asarray_tuplesafe

        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = LongPanel(sel.values['values'], index=long_index, columns=fields)

        if lp.consistent:
            lp = lp.sortlevel(level=0)
            wp = lp.to_wide()
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = LongPanel(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_wide()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Example #22
    def _convert_for_reindex(self, key, axis=0):
        labels = self.obj._get_axis(axis)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            return labels[np.asarray(key)]
        else:
            if isinstance(key, Index):
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)

            if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
                return labels.take(keyarr)

            return keyarr
Example #23
    def test_to_tuples_na(self, tuples, na_tuple):
        # GH 18756
        idx = IntervalIndex.from_tuples(tuples)
        result = idx.to_tuples(na_tuple=na_tuple)

        # check the non-NA portion
        expected_notna = Index(com._asarray_tuplesafe(tuples[:-1]))
        result_notna = result[:-1]
        tm.assert_index_equal(result_notna, expected_notna)

        # check the NA portion
        result_na = result[-1]
        if na_tuple:
            assert isinstance(result_na, tuple)
            assert len(result_na) == 2
            assert all(isna(x) for x in result_na)
        else:
            assert isna(result_na)
Example #24
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)
        axis_name = self.obj._get_axis_name(axis)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            return self.obj.reindex(**{axis_name: labels[np.asarray(key)]})
        else:
            if isinstance(key, Index):
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)
            if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
                keyarr = labels.take(keyarr)

            return self.obj.reindex(**{axis_name: keyarr})
Example #28
    def __init__(self, index, grouper=None, name=None, level=None,
                 sort=True):
        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        self.index = index
        self.sort = sort

        # right place for this?
        if isinstance(grouper, Series) and name is None:
            self.name = grouper.name

        # pre-computed
        self._was_factor = False

        if level is not None:
            if not isinstance(level, int):
                assert(level in index.names)
                level = index.names.index(level)

            inds = index.labels[level]
            level_index = index.levels[level]

            if self.name is None:
                self.name = index.names[level]

            # XXX complete hack

            level_values = index.levels[level].take(inds)
            if grouper is not None:
                self.grouper = level_values.map(self.grouper)
            else:
                self._was_factor = True
                self._labels = inds
                self._group_index = level_index
                self.grouper = level_values
        else:
            if isinstance(self.grouper, (list, tuple)):
                self.grouper = com._asarray_tuplesafe(self.grouper)

            # no level passed
            if not isinstance(self.grouper, np.ndarray):
                self.grouper = self.index.map(self.grouper)
Example #29
    def __init__(self, index, grouper=None, name=None, level=None, sort=True):
        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        self.index = index
        self.sort = sort

        # right place for this?
        if isinstance(grouper, Series) and name is None:
            self.name = grouper.name

        # pre-computed
        self._was_factor = False

        if level is not None:
            if not isinstance(level, int):
                assert (level in index.names)
                level = index.names.index(level)

            inds = index.labels[level]
            level_index = index.levels[level]

            if self.name is None:
                self.name = index.names[level]

            # XXX complete hack

            level_values = index.levels[level].take(inds)
            if grouper is not None:
                self.grouper = level_values.map(self.grouper)
            else:
                self._was_factor = True
                self._labels = inds
                self._group_index = level_index
                self.grouper = level_values
        else:
            if isinstance(self.grouper, (list, tuple)):
                self.grouper = com._asarray_tuplesafe(self.grouper)

            # no level passed
            if not isinstance(self.grouper, np.ndarray):
                self.grouper = self.index.map(self.grouper)
Example #30
    def _fancy_getitem_iterable(self, key, axis=0):
        from pandas.core.series import Series

        labels = self.frame._get_axis(axis)
        axis_name = self.frame._get_axis_name(axis)

        # asarray can be unsafe, NumPy strings are weird
        keyarr = _asarray_tuplesafe(key)

        if keyarr.dtype == np.bool_:
            if isinstance(key, Series):
                if not key.index.equals(labels):
                    raise Exception('Cannot use boolean index with misaligned '
                                    'or unequal labels')
            return self.frame.reindex(**{axis_name: labels[np.asarray(key)]})
        else:
            if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
                keyarr = labels.take(keyarr)

            return self.frame.reindex(**{axis_name: keyarr})
Example #31
    def drop(self, labels):
        try:
            if not isinstance(labels, np.ndarray):
                labels = _asarray_tuplesafe(labels)
            indexer, mask = self.get_indexer(labels)
            if not mask.all():
                raise ValueError('labels %s not contained in axis'
                                 % labels[~mask])
            return self.delete(indexer)
        except Exception:
            pass

        inds = []
        for label in labels:
            loc = self.get_loc(label)
            if isinstance(loc, int):
                inds.append(loc)
            else:
                inds.extend(range(loc.start, loc.stop))

        return self.delete(inds)
Example #32
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort :
    order :

    Returns
    -------
    """
    values = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(values)
    hash_klass, values = _get_data_algo(values, _hashtables)

    uniques = []
    table = hash_klass(len(values))
    labels, counts = table.get_labels(values, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = com._asarray_tuplesafe(uniques)
    if sort and len(counts) > 0:
        sorter = uniques.argsort()
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)
        counts = counts.take(sorter)

    if is_datetime:
        uniques = np.array(uniques, dtype='M8[ns]')

    return labels, uniques, counts
Example #34
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)
        axis_name = self.obj._get_axis_name(axis)

        # asarray can be unsafe, NumPy strings are weird
        if isinstance(key, Index):
            # want Index objects to pass through untouched
            keyarr = key
        else:
            keyarr = _asarray_tuplesafe(key)

        if keyarr.dtype == np.bool_:
            if _is_series(key):
                if not key.index.equals(labels):
                    raise IndexingError('Cannot use boolean index with '
                                        'misaligned or unequal labels')
            return self.obj.reindex(**{axis_name: labels[np.asarray(key)]})
        else:
            if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
                keyarr = labels.take(keyarr)

            return self.obj.reindex(**{axis_name: keyarr})
Example #35
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)
        axis_name = self.obj._get_axis_name(axis)

        # asarray can be unsafe, NumPy strings are weird
        if isinstance(key, Index):
            # want Index objects to pass through untouched
            keyarr = key
        else:
            keyarr = _asarray_tuplesafe(key)

        if keyarr.dtype == np.bool_:
            if _is_series(key):
                if not key.index.equals(labels):
                    raise IndexingError('Cannot use boolean index with '
                                        'misaligned or unequal labels')
            return self.obj.reindex(**{axis_name: labels[np.asarray(key)]})
        else:
            if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
                keyarr = labels.take(keyarr)

            return self.obj.reindex(**{axis_name: keyarr})
Example #36
    def test_int64_overflow(self):

        B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
        A = np.arange(2500)
        df = DataFrame({
            'A': A,
            'B': B,
            'C': A,
            'D': B,
            'E': A,
            'F': B,
            'G': A,
            'H': B,
            'values': np.random.randn(2500)
        })

        lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
        rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])

        left = lg.sum()['values']
        right = rg.sum()['values']

        exp_index, _ = left.index.sortlevel()
        tm.assert_index_equal(left.index, exp_index)

        exp_index, _ = right.index.sortlevel(0)
        tm.assert_index_equal(right.index, exp_index)

        tups = list(
            map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']].values))
        tups = com._asarray_tuplesafe(tups)

        expected = df.groupby(tups).sum()['values']

        for k, v in compat.iteritems(expected):
            assert left[k] == right[k[::-1]]
            assert left[k] == v
        assert len(left) == len(right)
Example #37
    def test_int64_overflow(self):

        B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
        A = np.arange(2500)
        df = DataFrame({'A': A,
                        'B': B,
                        'C': A,
                        'D': B,
                        'E': A,
                        'F': B,
                        'G': A,
                        'H': B,
                        'values': np.random.randn(2500)})

        lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
        rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])

        left = lg.sum()['values']
        right = rg.sum()['values']

        exp_index, _ = left.index.sortlevel()
        tm.assert_index_equal(left.index, exp_index)

        exp_index, _ = right.index.sortlevel(0)
        tm.assert_index_equal(right.index, exp_index)

        tups = list(map(tuple,
                        df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']].values))
        tups = com._asarray_tuplesafe(tups)

        expected = df.groupby(tups).sum()['values']

        for k, v in compat.iteritems(expected):
            assert left[k] == right[k[::-1]]
            assert left[k] == v
        assert len(left) == len(right)
Example #38
    def _convert_1d(values, unit, axis):
        def try_parse(values):
            try:
                return _dt_to_float_ordinal(tools.to_datetime(values))
            except Exception:
                return values

        if isinstance(values, (datetime, pydt.date)):
            return _dt_to_float_ordinal(values)
        elif isinstance(values, np.datetime64):
            return _dt_to_float_ordinal(lib.Timestamp(values))
        elif isinstance(values, pydt.time):
            return dates.date2num(values)
        elif (is_integer(values) or is_float(values)):
            return values
        elif isinstance(values, compat.string_types):
            return try_parse(values)
        elif isinstance(values, (list, tuple, np.ndarray, Index)):
            if isinstance(values, Index):
                values = values.values
            if not isinstance(values, np.ndarray):
                values = com._asarray_tuplesafe(values)

            if is_integer_dtype(values) or is_float_dtype(values):
                return values

            try:
                values = tools.to_datetime(values)
                if isinstance(values, Index):
                    values = _dt_to_float_ordinal(values)
                else:
                    values = [_dt_to_float_ordinal(x) for x in values]
            except Exception:
                values = _dt_to_float_ordinal(values)

        return values
Example #39
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Examples
    --------

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, string_types):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    result = _hashtable_algo(f, values.dtype, np.int64)

    if na_sentinel != -1:

        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas.core.series import Series
        result = Series(result.ravel()).replace(
            -1, na_sentinel).values.reshape(result.shape)

    return result
Example #40
    def test_to_tuples(self, tuples):
        # GH 18756
        idx = IntervalIndex.from_tuples(tuples)
        result = idx.to_tuples()
        expected = Index(com._asarray_tuplesafe(tuples))
        tm.assert_index_equal(result, expected)
Example #41
    def _convert_to_indexer(self, obj, axis=0):
        """
        Convert indexing key into something we can use to do actual fancy
        indexing on an ndarray

        Examples
        ix[:5] -> slice(0, 5)
        ix[[1,2,3]] -> [1,2,3]
        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

        Going by Zen of Python?
        "In the face of ambiguity, refuse the temptation to guess."
        raise AmbiguousIndexError with integer labels?
        - No, prefer label-based indexing
        """
        labels = self.obj._get_axis(axis)
        is_int_index = _is_integer_index(labels)

        if com.is_integer(obj) and not is_int_index:
            return obj

        try:
            return labels.get_loc(obj)
        except (KeyError, TypeError):
            pass

        if isinstance(obj, slice):
            ltype = labels.inferred_type

            if ltype == 'floating':
                int_slice = _is_int_slice(obj)
            else:
                # floats that are within tolerance of int used
                int_slice = _is_index_slice(obj)

            null_slice = obj.start is None and obj.stop is None
            # could have integers in the first level of the MultiIndex
            position_slice = (int_slice and not ltype == 'integer'
                              and not isinstance(labels, MultiIndex))

            start, stop = obj.start, obj.stop

            # last ditch effort: if we are mixed and have integers
            try:
                if 'mixed' in ltype and int_slice:
                    if start is not None:
                        i = labels.get_loc(start)
                    if stop is not None:
                        j = labels.get_loc(stop)
                    position_slice = False
            except KeyError:
                if ltype == 'mixed-integer-float':
                    raise

            if null_slice or position_slice:
                slicer = obj
            else:
                try:
                    i, j = labels.slice_locs(start, stop)
                    slicer = slice(i, j, obj.step)
                except Exception:
                    if _is_index_slice(obj):
                        if labels.inferred_type == 'integer':
                            raise
                        slicer = obj
                    else:
                        raise

            return slicer

        elif _is_list_like(obj):
            if com._is_bool_indexer(obj):
                objarr = _check_bool_indexer(labels, obj)
                return objarr
            else:
                if isinstance(obj, Index):
                    objarr = obj.values
                else:
                    objarr = _asarray_tuplesafe(obj)

                # If have integer labels, defer to label-based indexing
                if _is_integer_dtype(objarr) and not is_int_index:
                    if labels.inferred_type != 'integer':
                        objarr = np.where(objarr < 0,
                                          len(labels) + objarr, objarr)
                    return objarr

                # this is not the most robust, but...
                if (isinstance(labels, MultiIndex)
                        and not isinstance(objarr[0], tuple)):
                    level = 0
                    _, indexer = labels.reindex(objarr, level=level)

                    check = labels.levels[0].get_indexer(objarr)
                else:
                    level = None
                    # XXX
                    if labels.is_unique:
                        indexer = check = labels.get_indexer(objarr)
                    else:
                        mask = np.zeros(len(labels), dtype=bool)
                        lvalues = labels.values
                        for x in objarr:
                            # ugh
                            to_or = lib.map_infer(lvalues, x.__eq__)
                            if not to_or.any():
                                raise KeyError('%s not in index' % str(x))
                            mask |= to_or

                        indexer = check = mask.nonzero()[0]

                mask = check == -1
                if mask.any():
                    raise KeyError('%s not in index' % objarr[mask])

                return indexer
        else:
            return labels.get_loc(obj)
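The conversion logic above bottoms out in Index.get_loc (scalar keys) and Index.get_indexer (array keys), both public, so the label-vs-position distinction is easy to see in isolation:

import pandas as pd

idx = pd.Index(['foo', 'bar', 'baz'])
idx.get_loc('bar')                 # 1
idx.get_indexer(['foo', 'baz'])    # array([0, 2])
idx.get_indexer(['foo', 'qux'])    # array([ 0, -1]) -- the -1 is what the
                                   # mask check above turns into a KeyError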
Example #42
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)

        def _reindex(keys, level=None):
            try:
                return self.obj.reindex_axis(keys, axis=axis, level=level)
            except AttributeError:
                # Series
                if axis != 0:
                    raise AssertionError('axis must be 0')
                return self.obj.reindex(keys, level=level)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            inds, = key.nonzero()
            return self.obj.take(inds, axis=axis, convert=False)
        else:
            if isinstance(key, Index):
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)

            if _is_integer_dtype(keyarr):
                if labels.inferred_type != 'integer':
                    keyarr = np.where(keyarr < 0,
                                      len(labels) + keyarr, keyarr)

                if labels.inferred_type == 'mixed-integer':
                    indexer = labels.get_indexer(keyarr)
                    if (indexer >= 0).all():
                        return self.obj.take(indexer, axis=axis, convert=True)
                    else:
                        return self.obj.take(keyarr, axis=axis)
                elif not labels.inferred_type == 'integer':

                    return self.obj.take(keyarr, axis=axis)

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                    not isinstance(keyarr[0], tuple)):
                level = 0
            else:
                level = None

            if labels.is_unique and Index(keyarr).is_unique:
                return _reindex(keyarr, level=level)
            else:
                indexer, missing = labels.get_indexer_non_unique(keyarr)
                check = indexer != -1
                result = self.obj.take(indexer[check], axis=axis, convert=False)

                # need to merge the result labels and the missing labels
                if len(missing):
                    l = np.arange(len(indexer))

                    missing = com._ensure_platform_int(missing)
                    missing_labels = keyarr.take(missing)
                    missing_indexer = com._ensure_int64(l[~check])
                    cur_labels = result._get_axis(axis).values
                    cur_indexer = com._ensure_int64(l[check])

                    new_labels = np.empty(tuple([len(indexer)]), dtype=object)
                    new_labels[cur_indexer] = cur_labels
                    new_labels[missing_indexer] = missing_labels
                    new_indexer = (Index(cur_indexer) +
                                   Index(missing_indexer)).values
                    new_indexer[missing_indexer] = -1

                    # reindex with the specified axis
                    ndim = self.obj.ndim
                    if axis + 1 > ndim:
                        raise AssertionError("invalid indexing error with "
                                             "non-unique index")

                    args = [None] * (2 * ndim)
                    args[2 * axis] = new_labels
                    args[2 * axis + 1] = new_indexer

                    result = result._reindex_with_indexers(
                        *args, copy=False, fill_value=np.nan)

                return result
Example #43
def rolling_window(arg, window=None, win_type=None, min_periods=None,
                   freq=None, center=False, mean=True,
                   axis=0, how=None, **kwargs):
    """
    Applies a moving window of type ``window_type`` and size ``window``
    on the data.

    Parameters
    ----------
    arg : Series, DataFrame
    window : int or ndarray
        Weighting window specification. If the window is an integer, then it is
        treated as the window length and win_type is required
    win_type : str, default None
        Window type (see Notes)
    min_periods : int, default None
        Minimum number of observations in window required to have a value
        (otherwise result is NA).
    freq : string or DateOffset object, optional (default None)
        Frequency to conform the data to before computing the statistic. Specified
        as a frequency string or DateOffset object.
    center : boolean, default False
        Whether the label should correspond with center of window
    mean : boolean, default True
        If True computes weighted mean, else weighted sum
    axis : {0, 1}, default 0
    how : string, default 'mean'
        Method for down- or re-sampling

    Returns
    -------
    y : type of input argument

    Notes
    -----
    The recognized window types are:

    * ``boxcar``
    * ``triang``
    * ``blackman``
    * ``hamming``
    * ``bartlett``
    * ``parzen``
    * ``bohman``
    * ``blackmanharris``
    * ``nuttall``
    * ``barthann``
    * ``kaiser`` (needs beta)
    * ``gaussian`` (needs std)
    * ``general_gaussian`` (needs power, width)
    * ``slepian`` (needs width).

    By default, the result is set to the right edge of the window. This can be
    changed to the center of the window by setting ``center=True``.

    The `freq` keyword is used to conform time series data to a specified
    frequency by resampling the data. This is done with the default parameters
    of :meth:`~pandas.Series.resample` (i.e. using the `mean`).
    """
    if isinstance(window, (list, tuple, np.ndarray)):
        if win_type is not None:
            raise ValueError('Do not specify window type if using custom '
                             'weights')
        window = pdcom._asarray_tuplesafe(window).astype(float)
    elif pdcom.is_integer(window):  # window size
        if win_type is None:
            raise ValueError('Must specify window type')
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window weight')
        win_type = _validate_win_type(win_type, kwargs)  # may pop from kwargs
        window = sig.get_window(win_type, window).astype(float)
    else:
        raise ValueError('Invalid window %s' % str(window))

    minp = _use_window(min_periods, len(window))

    arg = _conv_timerule(arg, freq, how)
    return_hook, values = _process_data_structure(arg)

    f = lambda x: algos.roll_window(x, window, minp, avg=mean)
    result = np.apply_along_axis(f, axis, values)

    rs = return_hook(result)
    if center:
        rs = _center_window(rs, len(window), axis)
    return rs
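rolling_window itself is long gone; in current pandas the same computations go through .rolling. A hedged sketch of both the named-window and custom-weights paths:

import numpy as np
import pandas as pd

s = pd.Series(np.arange(10, dtype=float))

# named window type (requires scipy); extra window parameters go to the
# aggregation call, e.g. std for a gaussian window
s.rolling(window=5, win_type='gaussian').mean(std=1.0)

# custom weights: apply them by hand over a plain rolling window
w = np.array([0.25, 0.5, 0.25])
s.rolling(window=3).apply(lambda x: np.dot(x, w) / w.sum())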
Example #44
def rolling_window(arg,
                   window=None,
                   win_type=None,
                   min_periods=None,
                   freq=None,
                   center=False,
                   mean=True,
                   axis=0,
                   how=None,
                   **kwargs):
    """
    Applies a moving window of type ``window_type`` and size ``window``
    on the data.

    Parameters
    ----------
    arg : Series, DataFrame
    window : int or ndarray
        Weighting window specification. If the window is an integer, then it is
        treated as the window length and win_type is required
    win_type : str, default None
        Window type (see Notes)
    min_periods : int, default None
        Minimum number of observations in window required to have a value
        (otherwise result is NA).
    freq : string or DateOffset object, optional (default None)
        Frequency to conform the data to before computing the statistic. Specified
        as a frequency string or DateOffset object.
    center : boolean, default False
        Whether the label should correspond with center of window
    mean : boolean, default True
        If True computes weighted mean, else weighted sum
    axis : {0, 1}, default 0
    how : string, default 'mean'
        Method for down- or re-sampling

    Returns
    -------
    y : type of input argument

    Notes
    -----
    The recognized window types are:

    * ``boxcar``
    * ``triang``
    * ``blackman``
    * ``hamming``
    * ``bartlett``
    * ``parzen``
    * ``bohman``
    * ``blackmanharris``
    * ``nuttall``
    * ``barthann``
    * ``kaiser`` (needs beta)
    * ``gaussian`` (needs std)
    * ``general_gaussian`` (needs power, width)
    * ``slepian`` (needs width).

    By default, the result is set to the right edge of the window. This can be
    changed to the center of the window by setting ``center=True``.

    The `freq` keyword is used to conform time series data to a specified
    frequency by resampling the data. This is done with the default parameters
    of :meth:`~pandas.Series.resample` (i.e. using the `mean`).
    """
    if isinstance(window, (list, tuple, np.ndarray)):
        if win_type is not None:
            raise ValueError('Do not specify window type if using custom '
                             'weights')
        window = pdcom._asarray_tuplesafe(window).astype(float)
    elif pdcom.is_integer(window):  # window size
        if win_type is None:
            raise ValueError('Must specify window type')
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window weight')
        win_type = _validate_win_type(win_type, kwargs)  # may pop from kwargs
        window = sig.get_window(win_type, window).astype(float)
    else:
        raise ValueError('Invalid window %s' % str(window))

    minp = _use_window(min_periods, len(window))

    arg = _conv_timerule(arg, freq, how)
    return_hook, values = _process_data_structure(arg)

    offset = int((len(window) - 1) / 2.) if center else 0
    additional_nans = np.array([np.NaN] * offset)
    f = lambda x: algos.roll_window(np.concatenate((x, additional_nans))
                                    if center else x,
                                    window,
                                    minp,
                                    avg=mean)
    result = np.apply_along_axis(f, axis, values)

    if center:
        result = _center_window(result, len(window), axis)

    return return_hook(result)
Example #45
    def _convert_to_indexer(self, obj, axis=0):
        """
        Convert indexing key into something we can use to do actual fancy
        indexing on an ndarray

        Examples
        ix[:5] -> slice(0, 5)
        ix[[1,2,3]] -> [1,2,3]
        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

        Going by Zen of Python?
        "In the face of ambiguity, refuse the temptation to guess."
        raise AmbiguousIndexError with integer labels?
        - No, prefer label-based indexing
        """
        labels = self.obj._get_axis(axis)
        is_int_index = _is_integer_index(labels)

        if com.is_integer(obj) and not is_int_index:
            return obj

        try:
            return labels.get_loc(obj)
        except (KeyError, TypeError):
            pass

        if isinstance(obj, slice):

            int_slice = _is_index_slice(obj)
            null_slice = obj.start is None and obj.stop is None
            # could have integers in the first level of the MultiIndex
            position_slice = (int_slice
                              and not labels.inferred_type == 'integer'
                              and not isinstance(labels, MultiIndex))

            start, stop = obj.start, obj.stop

            # last ditch effort: if we are mixed and have integers
            try:
                if 'mixed' in labels.inferred_type and int_slice:
                    if start is not None:
                        i = labels.get_loc(start)
                    if stop is not None:
                        j = labels.get_loc(stop)
                    position_slice = False
            except KeyError:
                if labels.inferred_type == 'mixed-integer':
                    raise

            if null_slice or position_slice:
                slicer = obj
            else:
                try:
                    i, j = labels.slice_locs(start, stop)
                    slicer = slice(i, j, obj.step)
                except Exception:
                    if _is_index_slice(obj):
                        if labels.inferred_type == 'integer':
                            raise
                        slicer = obj
                    else:
                        raise

            return slicer

        elif _is_list_like(obj):
            if com._is_bool_indexer(obj):
                objarr = _check_bool_indexer(labels, obj)
                return objarr
            else:
                objarr = _asarray_tuplesafe(obj)

                # If have integer labels, defer to label-based indexing
                if _is_integer_dtype(objarr) and not is_int_index:
                    return objarr

                indexer = labels.get_indexer(objarr)
                mask = indexer == -1
                if mask.any():
                    raise KeyError('%s not in index' % objarr[mask])

                return indexer
        else:
            return labels.get_loc(obj)
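The list branch above boils down to Index.get_indexer plus a KeyError on any -1. A small demonstration with the public Index API (a sketch of the rule, not the internal code path):

import numpy as np
import pandas as pd

labels = pd.Index(['foo', 'bar', 'baz'])
keys = np.asarray(['foo', 'baz'], dtype=object)

indexer = labels.get_indexer(keys)
print(indexer)                     # [0 2], i.e. positions of the labels

mask = indexer == -1
if mask.any():
    # mirrors the "not in index" branch above
    raise KeyError('%s not in index' % keys[mask])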
Example #46
    def __init__(self,
                 index,
                 grouper=None,
                 obj=None,
                 name=None,
                 level=None,
                 sort=True,
                 observed=False,
                 in_axis=False):

        self.name = name
        self.level = level
        self.grouper = _convert_grouper(index, grouper)
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis

        # right place for this?
        if isinstance(grouper, (Series, Index)) and name is None:
            self.name = grouper.name

        if isinstance(grouper, MultiIndex):
            self.grouper = grouper.values

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        if level is not None:
            if not isinstance(level, int):
                if level not in index.names:
                    raise AssertionError('Level %s not in index' % str(level))
                level = index.names.index(level)

            if self.name is None:
                self.name = index.names[level]

            self.grouper, self._labels, self._group_index = \
                index._get_grouper_for_level(self.grouper, level)

        # a passed Grouper-like object: get the grouper directly, the
        # same way as a single-grouper groupby; use the group_info to
        # get labels
        elif isinstance(self.grouper, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
            if self.name is None:
                self.name = grouper.result_index.name
            self.obj = self.grouper.obj
            self.grouper = grouper

        else:
            if self.grouper is None and self.name is not None:
                self.grouper = self.obj[self.name]

            elif isinstance(self.grouper, (list, tuple)):
                self.grouper = com._asarray_tuplesafe(self.grouper)

            # a passed Categorical
            elif is_categorical_dtype(self.grouper):

                from pandas.core.groupby.categorical import recode_for_groupby
                self.grouper, self.all_grouper = recode_for_groupby(
                    self.grouper, self.sort, observed)
                categories = self.grouper.categories

                # we make a CategoricalIndex out of the cat grouper
                # preserving the categories / ordered attributes
                self._labels = self.grouper.codes
                if observed:
                    codes = algorithms.unique1d(self.grouper.codes)
                else:
                    codes = np.arange(len(categories))

                self._group_index = CategoricalIndex(
                    Categorical.from_codes(codes=codes,
                                           categories=categories,
                                           ordered=self.grouper.ordered))

            # we are done
            if isinstance(self.grouper, Grouping):
                self.grouper = self.grouper.grouper

            # no level passed
            elif not isinstance(self.grouper,
                                (Series, Index, ExtensionArray, np.ndarray)):
                if getattr(self.grouper, 'ndim', 1) != 1:
                    t = self.name or str(type(self.grouper))
                    raise ValueError("Grouper for '%s' not 1-dimensional" % t)
                self.grouper = self.index.map(self.grouper)
                if not (hasattr(self.grouper, "__len__")
                        and len(self.grouper) == len(self.index)):
                    errmsg = ('Grouper result violates len(labels) == '
                              'len(data)\nresult: %s' %
                              pprint_thing(self.grouper))
                    self.grouper = None  # Try for sanity
                    raise AssertionError(errmsg)

        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        if getattr(self.grouper, 'dtype', None) is not None:
            if is_datetime64_dtype(self.grouper):
                from pandas import to_datetime
                self.grouper = to_datetime(self.grouper)
            elif is_timedelta64_dtype(self.grouper):
                from pandas import to_timedelta
                self.grouper = to_timedelta(self.grouper)
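Two effects of the constructor above are visible from the public API: a named Series grouper donates its name, and a level name is resolved to that level's values. A sketch of the observable behavior (not the internals):

import pandas as pd

df = pd.DataFrame({'key': ['a', 'b', 'a'], 'val': [1, 2, 3]})

# A named Series grouper: the Grouping picks up grouper.name.
print(df.groupby(df['key'])['val'].sum())

# A level name: resolved to a position, and that level's values
# become the grouper.
print(df.set_index('key').groupby(level='key')['val'].sum())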
Example #47
def _get_grouper(obj,
                 key=None,
                 axis=0,
                 level=None,
                 sort=True,
                 observed=False,
                 mutated=False,
                 validate=True):
    """
    create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values

    If validate, then check for key/level overlaps

    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: This if-block and the else-block are almost the same.
        # The MultiIndex instance check is removable, but some steps in
        # the else-block apply only to non-MultiIndex objects,
        # e.g. `obj.index.name != level`. We have to consider carefully
        # whether these are applicable to MultiIndex, and even if they
        # are, we need to check that they have no side effects on
        # subsequent processing outside this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError('No group keys passed!')
                else:
                    raise ValueError('multiple levels only valid with '
                                     'MultiIndex')

            if isinstance(level, compat.string_types):
                if obj.index.name != level:
                    raise ValueError('level name %s is not the name of the '
                                     'index' % level)
            elif level > 0 or level < -1:
                raise ValueError('level > 0 or level < -1 only valid with '
                                 'MultiIndex')

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, set([key.key]), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    # In the future, a tuple key will always mean an actual key,
    # not an iterable of keys. In the meantime, we attempt to provide
    # a warning. We can assume that the user wanted a list of keys when
    # the key is not in the index. We just have to be careful with
    # unhashable elements of `key`. Any unhashable elements imply that
    # the user wanted a list of keys.
    # https://github.com/pandas-dev/pandas/issues/18314
    is_tuple = isinstance(key, tuple)
    all_hashable = is_tuple and is_hashable(key)

    if is_tuple:
        if ((all_hashable and key not in obj and set(key).issubset(obj))
                or not all_hashable):
            # column names ('a', 'b') -> ['a', 'b']
            # arrays like (a, b) -> [a, b]
            msg = ("Interpreting tuple 'by' as a list of keys, rather than "
                   "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
                   "the future, a tuple will always mean a single key.")
            warnings.warn(msg, FutureWarning, stacklevel=5)
            key = list(key)

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys)

    try:
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(g in obj.columns or g in obj.index.names
                                       for g in keys)
        else:
            all_in_columns_index = False
    except Exception:
        all_in_columns_index = False

    if not any_callable and not all_in_columns_index and \
       not any_arraylike and not any_groupers and \
       match_axis_length and level is None:
        keys = [com._asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key):
        if not _is_label_like(key):
            try:
                obj._data.items.get_loc(key)
            except Exception:
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        try:
            return id(gpr) == id(obj[gpr.name])
        except Exception:
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    stacklevel = 5  # Number of stack levels from df.groupby
                    obj._check_label_or_level_ambiguity(gpr,
                                                        stacklevel=stacklevel)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr):
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                ("Length of grouper ({len_gpr}) and axis ({len_axis})"
                 " must be same length".format(len_gpr=len(gpr),
                                               len_axis=obj.shape[axis])))

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = Grouping(group_axis,
                        gpr,
                        obj=obj,
                        name=name,
                        level=level,
                        sort=sort,
                        observed=observed,
                        in_axis=in_axis) \
            if not isinstance(gpr, Grouping) else gpr

        groupings.append(ping)

    if len(groupings) == 0:
        raise ValueError('No group keys passed!')

    # create the internals grouper
    grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj
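Two of the branches above are easy to see from the outside: an array the same length as the axis is a row-aligned grouper, and, in the pandas version this code comes from, a same-length list of non-column labels is folded into one tuple-safe array by the `keys = [com._asarray_tuplesafe(keys)]` line. A hedged sketch of the array case:

import numpy as np
import pandas as pd

df = pd.DataFrame({'v': [1., 2., 3.]})

# A row-aligned array grouper (the any_arraylike path above): each
# element labels the row at the same position.
key = np.array(['x', 'y', 'x'], dtype=object)
print(df.groupby(key)['v'].sum())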
Example #48
 def to_tuples(self, na_tuple=True):
     tuples = com._asarray_tuplesafe(zip(self.left, self.right))
     if not na_tuple:
         # GH 18756
         tuples = np.where(~self.isna(), tuples, np.nan)
     return tuples
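Seen through the public API, the conversion above turns an interval array into an ndarray of (left, right) tuples, with na_tuple=False replacing a missing interval by a scalar NaN rather than a (nan, nan) tuple (GH 18756):

import numpy as np
import pandas as pd

idx = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan])
print(idx.to_tuples())                # missing interval -> (nan, nan)
print(idx.to_tuples(na_tuple=False))  # missing interval -> nan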
Example #49
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
    tups = lmap(tuple, df[keys].values)
    tups = com._asarray_tuplesafe(tups)
    expected = f(df.groupby(tups)[field])
    for k, v in compat.iteritems(expected):
        assert (result[k] == v)
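The helper above cross-checks a column groupby against grouping by the row tuples themselves. The same check, spelled with public calls (a sketch; `lmap` and `compat.iteritems` above are py2/py3 shims from that era):

import pandas as pd

df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4], 'v': [1., 2., 3.]})
result = df.groupby(['a', 'b'])['v'].sum()

# Build an object Index of row tuples; tupleize_cols=False keeps the
# tuples intact, much like _asarray_tuplesafe does.
tups = pd.Index(list(map(tuple, df[['a', 'b']].values)),
                tupleize_cols=False)
expected = df.groupby(tups)['v'].sum()

for k, v in expected.items():
    assert result[k] == v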
Example #50
 def _convert_arr_indexer(self, keyarr):
     keyarr = _asarray_tuplesafe(keyarr)
     return self._shallow_copy(keyarr)
Example #51
    def _read_panel_table(self, group, where=None):
        table = getattr(group, 'table')
        fields = table._v_attrs.fields

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor.from_array(index)
        minor = Factor.from_array(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
            sorter = com._ensure_platform_int(sorter)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block],
                               [block.ref_items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index._tuple_index

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)
            indexer = com._ensure_platform_int(indexer)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
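The duplicate check above leans on a composite-key trick: two label arrays are encoded as one integer array via `major.labels * K + minor.labels`, so uniqueness of (major, minor) pairs reduces to uniqueness of integers. A NumPy sketch of that encoding:

import numpy as np

major = np.array([0, 0, 1, 1])
minor = np.array([0, 1, 0, 1])
K = minor.max() + 1        # number of minor categories

key = major * K + minor    # distinct pair -> distinct integer
print(key)                                # [0 1 2 3]
print(len(np.unique(key)) == len(key))    # True -> no duplicate cells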
Example #52
    def _convert_to_indexer(self, obj, axis=0):
        """
        Convert indexing key into something we can use to do actual fancy
        indexing on an ndarray

        Examples
        ix[:5] -> slice(0, 5)
        ix[[1,2,3]] -> [1,2,3]
        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

        Going by Zen of Python?
        "In the face of ambiguity, refuse the temptation to guess."
        raise AmbiguousIndexError with integer labels?
        - No, prefer label-based indexing
        """
        labels = self.obj._get_axis(axis)
        is_int_index = _is_integer_index(labels)

        if com.is_integer(obj) and not is_int_index:
            return obj

        try:
            return labels.get_loc(obj)
        except (KeyError, TypeError):
            pass

        if isinstance(obj, slice):
            ltype = labels.inferred_type

            # in case of providing all floats, use label-based indexing
            float_slice = (labels.inferred_type == 'floating'
                           and _is_float_slice(obj))

            # floats that are within tolerance of int used as positions
            int_slice = _is_index_slice(obj)

            null_slice = obj.start is None and obj.stop is None

            # could have integers in the first level of the MultiIndex,
            # in which case we wouldn't want to do position-based slicing
            position_slice = (int_slice
                              and not ltype == 'integer'
                              and not isinstance(labels, MultiIndex)
                              and not float_slice)

            start, stop = obj.start, obj.stop

            # last ditch effort: if we are mixed and have integers
            try:
                if position_slice and 'mixed' in ltype:
                    if start is not None:
                        i = labels.get_loc(start)
                    if stop is not None:
                        j = labels.get_loc(stop)
                    position_slice = False
            except KeyError:
                if ltype == 'mixed-integer-float':
                    raise

            if null_slice or position_slice:
                indexer = obj
            else:
                try:
                    indexer = labels.slice_indexer(start, stop, obj.step)
                except Exception:
                    if _is_index_slice(obj):
                        if ltype == 'integer':
                            raise
                        indexer = obj
                    else:
                        raise

            return indexer

        elif _is_list_like(obj):
            if com._is_bool_indexer(obj):
                obj = _check_bool_indexer(labels, obj)
                inds, = obj.nonzero()
                return inds
            else:
                if isinstance(obj, Index):
                    objarr = obj.values
                else:
                    objarr = _asarray_tuplesafe(obj)

                # If have integer labels, defer to label-based indexing
                if _is_integer_dtype(objarr) and not is_int_index:
                    if labels.inferred_type != 'integer':
                        objarr = np.where(objarr < 0,
                                          len(labels) + objarr, objarr)
                    return objarr

                # this is not the most robust, but...
                if (isinstance(labels, MultiIndex) and
                        not isinstance(objarr[0], tuple)):
                    level = 0
                    _, indexer = labels.reindex(objarr, level=level)

                    check = labels.levels[0].get_indexer(objarr)
                else:
                    level = None

                    # unique index
                    if labels.is_unique:
                        indexer = check = labels.get_indexer(objarr)

                    # non-unique (dups)
                    else:
                        indexer, missing = labels.get_indexer_non_unique(objarr)
                        check = indexer

                mask = check == -1
                if mask.any():
                    raise KeyError('%s not in index' % objarr[mask])

                return indexer

        else:
            return labels.get_loc(obj)
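Compared with the earlier copy, this version also wraps negative positional integers (the `np.where(objarr < 0, len(labels) + objarr, objarr)` line) and routes non-unique indexes through get_indexer_non_unique. The wrapping rule in isolation (a sketch, not the internal path):

import numpy as np

labels_len = 5
objarr = np.array([0, -1, -2])
# Negative positions count from the end, as in plain Python indexing.
positions = np.where(objarr < 0, labels_len + objarr, objarr)
print(positions)   # [0 4 3]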
Example #53
def rolling_window(arg, window=None, win_type=None, min_periods=None,
                   freq=None, center=False, mean=True, time_rule=None,
                   axis=0, **kwargs):
    """
    Applies a moving window of type ``win_type`` and size ``window``
    on the data.

    Parameters
    ----------
    arg : Series, DataFrame
    window : int or ndarray
        Weighting window specification. If the window is an integer, then it is
        treated as the window length and win_type is required
    win_type : str, default None
        Window type (see Notes)
    min_periods : int
        Minimum number of observations in window required to have a value.
    freq : None or string alias / date offset object, default=None
        Frequency to conform to before computing statistic
    center : boolean, default False
        Whether the label should correspond with center of window
    mean : boolean, default True
        If True computes weighted mean, else weighted sum

    Returns
    -------
    y : type of input argument

    Notes
    -----
    The recognized window types are:

    * ``boxcar``
    * ``triang``
    * ``blackman``
    * ``hamming``
    * ``bartlett``
    * ``parzen``
    * ``bohman``
    * ``blackmanharris``
    * ``nuttall``
    * ``barthann``
    * ``kaiser`` (needs beta)
    * ``gaussian`` (needs std)
    * ``general_gaussian`` (needs power, width)
    * ``slepian`` (needs width).
    """
    if isinstance(window, (list, tuple, np.ndarray)):
        if win_type is not None:
            raise ValueError(('Do not specify window type if using custom '
                              'weights'))
        window = com._asarray_tuplesafe(window).astype(float)
    elif com.is_integer(window):  # window size
        if win_type is None:
            raise ValueError('Must specify window type')
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window weights')
        win_type = _validate_win_type(win_type, kwargs)  # may pop from kwargs
        window = sig.get_window(win_type, window).astype(float)
    else:
        raise ValueError('Invalid window %s' % str(window))

    minp = _use_window(min_periods, len(window))

    arg = _conv_timerule(arg, freq, time_rule)
    return_hook, values = _process_data_structure(arg)

    f = lambda x: algos.roll_window(x, window, minp, avg=mean)
    result = np.apply_along_axis(f, axis, values)

    rs = return_hook(result)
    if center:
        rs = _center_window(rs, len(window), axis)
    return rs
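For reference, the weighted-window computation above survives in modern pandas as `.rolling(..., win_type=...)`, which likewise needs scipy for the named window types (a usage sketch, not this exact function):

import numpy as np
import pandas as pd

s = pd.Series(np.arange(10, dtype=float))
# Triangular weights over a window of 5; .mean() computes the
# weighted mean, as mean=True does above.
print(s.rolling(window=5, win_type='triang').mean())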
Example #55
    def _convert_to_indexer(self, obj, axis=0):
        """
        Convert indexing key into something we can use to do actual fancy
        indexing on an ndarray

        Examples
        ix[:5] -> slice(0, 5)
        ix[[1,2,3]] -> [1,2,3]
        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

        Going by Zen of Python?
        "In the face of ambiguity, refuse the temptation to guess."
        raise AmbiguousIndexError with integer labels?
        - No, prefer label-based indexing
        """
        labels = self.obj._get_axis(axis)
        is_int_index = _is_integer_index(labels)

        if com.is_integer(obj) and not is_int_index:
            return obj

        try:
            return labels.get_loc(obj)
        except (KeyError, TypeError):
            pass

        if isinstance(obj, slice):

            int_slice = _is_index_slice(obj)
            null_slice = obj.start is None and obj.stop is None
            # could have integers in the first level of the MultiIndex
            position_slice = (int_slice
                              and not labels.inferred_type == 'integer'
                              and not isinstance(labels, MultiIndex))

            start, stop = obj.start, obj.stop

            # last ditch effort: if we are mixed and have integers
            try:
                if 'mixed' in labels.inferred_type and int_slice:
                    if start is not None:
                        i = labels.get_loc(start)
                    if stop is not None:
                        j = labels.get_loc(stop)
                    position_slice = False
            except KeyError:
                if labels.inferred_type == 'mixed-integer':
                    raise

            if null_slice or position_slice:
                slicer = obj
            else:
                try:
                    i, j = labels.slice_locs(start, stop)
                    slicer = slice(i, j, obj.step)
                except Exception:
                    if _is_index_slice(obj):
                        if labels.inferred_type == 'integer':
                            raise
                        slicer = obj
                    else:
                        raise

            return slicer

        elif _is_list_like(obj):
            if com._is_bool_indexer(obj):
                objarr = _check_bool_indexer(labels, obj)
                return objarr
            else:
                objarr = _asarray_tuplesafe(obj)

                # If have integer labels, defer to label-based indexing
                if _is_integer_dtype(objarr) and not is_int_index:
                    return objarr

                # this is not the most robust, but...
                if (isinstance(labels, MultiIndex) and
                    not isinstance(objarr[0], tuple)):
                    level = 0
                    _, indexer = labels.reindex(objarr, level=level)

                    check = labels.levels[0].get_indexer(objarr)
                else:
                    level = None
                    indexer = check = labels.get_indexer(objarr)

                mask = check == -1
                if mask.any():
                    raise KeyError('%s not in index' % objarr[mask])

                return indexer
        else:
            return labels.get_loc(obj)
Example #56
 def to_tuples(self):
     """Return an Index of tuples of the form (left, right)"""
     return Index(_asarray_tuplesafe(zip(self.left, self.right)))
Example #57
 def to_tuples(self):
     return Index(_asarray_tuplesafe(zip(self.left, self.right)))
Example #58
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
    """
    create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    """

    # The implementation is essentially the same as pandas.core.groupby

    group_axis = obj._get_axis(axis)

    # validate that the passed level is compatible with the passed
    # axis of the object
    if level is not None:
        if not isinstance(group_axis, MultiIndex):
            if isinstance(level, compat.string_types):
                if obj.index.name != level:
                    raise ValueError('level name %s is not the name of the '
                                     'index' % level)
            elif level > 0:
                raise ValueError('level > 0 only valid with MultiIndex')

            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, set([key.key]), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    if not isinstance(key, (tuple, list)):
        keys = [key]
    else:
        keys = key

    # what are we after, exactly?
    match_axis_length = len(keys) == len(group_axis)
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys)

    try:
        if isinstance(obj, DataFrame):
            all_in_columns = all(g in obj.columns for g in keys)
        else:
            all_in_columns = False
    except Exception:
        all_in_columns = False

    if (not any_callable and not all_in_columns and not any_arraylike
            and match_axis_length and level is None):
        keys = [com._asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key):
        if not _is_label_like(key):
            try:
                obj._data.items.get_loc(key)
            except Exception:
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        try:
            return id(gpr) == id(obj[gpr.name])
        except Exception:
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            in_axis, name, gpr = True, gpr, obj[gpr]
            exclusions.append(name)

        else:
            in_axis, name = False, None

        if com.is_categorical_dtype(gpr) and len(gpr) != len(obj):
            raise ValueError(
                "Categorical dtype grouper must have len(grouper) == len(data)"
            )

        ping = Grouping(group_axis,
                        gpr,
                        obj=obj,
                        name=name,
                        level=level,
                        sort=sort,
                        in_axis=in_axis)

        groupings.append(ping)

    if len(groupings) == 0:
        raise ValueError('No group keys passed!')

    # create the internals grouper
    # Modified to insert CustomGrouper
    grouper = CustomGrouper(group_axis, groupings, sort=sort)
    return grouper, exclusions, obj
Example #59
    def test_int64_overflow(self):

        B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
        A = np.arange(2500)
        df = DataFrame({'A': A,
                        'B': B,
                        'C': A,
                        'D': B,
                        'E': A,
                        'F': B,
                        'G': A,
                        'H': B,
                        'values': np.random.randn(2500)})

        lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
        rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])

        left = lg.sum()['values']
        right = rg.sum()['values']

        exp_index, _ = left.index.sortlevel()
        self.assert_index_equal(left.index, exp_index)

        exp_index, _ = right.index.sortlevel(0)
        self.assert_index_equal(right.index, exp_index)

        tups = list(map(tuple, df[['A', 'B', 'C', 'D',
                                   'E', 'F', 'G', 'H']].values))
        tups = com._asarray_tuplesafe(tups)

        expected = df.groupby(tups).sum()['values']

        for k, v in compat.iteritems(expected):
            self.assertEqual(left[k], right[k[::-1]])
            self.assertEqual(left[k], v)
        self.assertEqual(len(left), len(right))

        # GH9096
        values = range(55109)
        data = pd.DataFrame.from_dict({'a': values,
                                       'b': values,
                                       'c': values,
                                       'd': values})
        grouped = data.groupby(['a', 'b', 'c', 'd'])
        self.assertEqual(len(grouped), len(values))

        arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
        i = np.random.choice(len(arr), len(arr) * 4)
        arr = np.vstack((arr, arr[i]))  # add some duplicate rows

        i = np.random.permutation(len(arr))
        arr = arr[i]  # shuffle rows

        df = DataFrame(arr, columns=list('abcde'))
        df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
        gr = df.groupby(list('abcde'))

        # verify this is testing what it is supposed to test!
        self.assertTrue(is_int64_overflow_possible(gr.grouper.shape))

        # manually compute groupings
        jim, joe = defaultdict(list), defaultdict(list)
        for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
            jim[key].append(a)
            joe[key].append(b)

        self.assertEqual(len(gr), len(jim))
        mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))

        def aggr(func):
            f = lambda a: np.fromiter(map(func, a), dtype='f8')
            arr = np.vstack((f(jim.values()), f(joe.values()))).T
            res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
            return res.sort_index()

        assert_frame_equal(gr.mean(), aggr(np.mean))
        assert_frame_equal(gr.median(), aggr(np.median))
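The overflow guard in this test is pure arithmetic: the internal composite group key multiplies the per-level cardinalities, and five levels of up to 2**13 labels blow past what an int64 can hold, which is what `is_int64_overflow_possible` checks for. The arithmetic in isolation (a sketch of the idea, not that function's implementation):

shape = (8192,) * 5          # five grouping columns, up to 2**13 labels each
product = 1
for n in shape:
    product *= n             # Python ints don't overflow, so this is exact

print(product)               # 2**65
print(product > 2**63 - 1)   # True: an int64 composite key would overflow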