def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are # also uint64. keyarr = _asarray_tuplesafe(keyarr) if is_integer_dtype(keyarr): return _asarray_tuplesafe(keyarr, dtype=np.uint64) return keyarr
def drop(self, labels):
    """
    Make new MultiIndex with passed list of labels deleted

    Parameters
    ----------
    labels : array-like
        Must be a list of tuples

    Returns
    -------
    dropped : MultiIndex
    """
    try:
        if not isinstance(labels, np.ndarray):
            labels = _asarray_tuplesafe(labels)
        indexer, mask = self.get_indexer(labels)
        if not mask.all():
            # BUG FIX: invert the boolean mask with ~ ; unary minus on a
            # boolean array is deprecated/invalid in numpy.
            # NOTE(review): this ValueError is caught by the enclosing
            # except below and triggers the per-label fallback loop.
            raise ValueError('labels %s not contained in axis'
                             % labels[~mask])
        return self.delete(indexer)
    except Exception:
        # fall through to a label-by-label lookup (e.g. when get_indexer
        # cannot handle the labels)
        pass

    inds = []
    for label in labels:
        loc = self.get_loc(label)
        if isinstance(loc, int):
            inds.append(loc)
        else:
            # slice of positions for a partial-tuple match
            inds.extend(range(loc.start, loc.stop))
    return self.delete(inds)
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, string_types):
        # hashtable machinery works on object arrays for strings
        values = np.array(values, dtype='O')

    def f(htype, caster):
        return _match_generic(to_match, values, htype, caster)

    result = _hashtable_algo(f, values.dtype)

    if na_sentinel == -1:
        return result

    # swap the default -1 sentinel for the requested one; route through a
    # Series so dtype conversions are handled properly
    from pandas.core.series import Series
    replaced = Series(result.ravel()).replace(-1, na_sentinel)
    return replaced.values.reshape(result.shape)
def convert(values, unit, axis):
    """Convert datetime-like input to matplotlib float date ordinals."""

    def try_parse(values):
        # best-effort string parse; return the input untouched on failure
        try:
            return _dt_to_float_ordinal(tools.to_datetime(values))
        except Exception:
            return values

    if isinstance(values, (datetime, pydt.date)):
        return _dt_to_float_ordinal(values)
    if isinstance(values, pydt.time):
        return dates.date2num(values)
    if com.is_integer(values) or com.is_float(values):
        # already numeric -> assumed to be an ordinal
        return values
    if isinstance(values, compat.string_types):
        return try_parse(values)
    if isinstance(values, (list, tuple, np.ndarray)):
        if not isinstance(values, np.ndarray):
            values = com._asarray_tuplesafe(values)

        if com.is_integer_dtype(values) or com.is_float_dtype(values):
            return values

        try:
            values = tools.to_datetime(values)
            if isinstance(values, Index):
                values = values.map(_dt_to_float_ordinal)
            else:
                values = [_dt_to_float_ordinal(x) for x in values]
        except Exception:
            pass

    return values
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort :
    order :

    Returns
    -------
    """
    hash_klass, values = _get_hash_table_and_cast(values)

    table = hash_klass(len(values))
    uniques = []
    labels, counts = table.get_labels(values, uniques, 0, na_sentinel)

    uniques = com._asarray_tuplesafe(uniques)
    if sort and len(counts) > 0:
        order_of_uniques = uniques.argsort()
        # invert the permutation: old label -> position in sorted uniques
        inverse = np.empty(len(order_of_uniques), dtype=np.int32)
        inverse.put(order_of_uniques, np.arange(len(order_of_uniques)))

        na_mask = labels < 0
        labels = inverse.take(labels)
        np.putmask(labels, na_mask, -1)  # restore the NA sentinel

        uniques = uniques.take(order_of_uniques)
        counts = counts.take(order_of_uniques)

    return labels, uniques, counts
def _convert_arr_indexer(self, keyarr):
    """Convert an array-like key for indexing against a CategoricalIndex."""
    keyarr = com._asarray_tuplesafe(keyarr)
    # if the categories handle their own indexing, hand back the raw
    # array; otherwise wrap it in an index of the same type
    return (keyarr if self.categories._defer_to_indexing
            else self._shallow_copy(keyarr))
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, compat.string_types):
        # hashtable machinery expects object arrays for strings
        values = np.array(values, dtype='O')

    def f(htype, caster):
        return _match_generic(to_match, values, htype, caster)

    return _hashtable_algo(f, values.dtype)
def _getitem_iterable(self, key, axis=0):
    # Index with a list-like key along ``axis``, returning a reindexed
    # object (requested labels not present become missing entries).
    labels = self.obj._get_axis(axis)

    def _reindex(keys, level=None):
        # DataFrame/Panel support reindex_axis; a Series (which has no
        # such method) falls back to plain reindex on axis 0 only.
        try:
            return self.obj.reindex_axis(keys, axis=axis, level=level)
        except AttributeError:
            # Series
            assert(axis == 0)
            return self.obj.reindex(keys, level=level)

    if com._is_bool_indexer(key):
        # boolean mask: validate/align against the axis labels, then
        # select the labels where the mask is True
        key = _check_bool_indexer(labels, key)
        return _reindex(labels[np.asarray(key)])
    else:
        if isinstance(key, Index):
            # want Index objects to pass through untouched
            keyarr = key
        else:
            # asarray can be unsafe, NumPy strings are weird
            keyarr = _asarray_tuplesafe(key)

        if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
            # integer positions against a non-integer index: translate
            # positions into the corresponding labels
            keyarr = labels.take(keyarr)

        # this is not the most robust, but...
        if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
            # scalar keys against a MultiIndex select on the first level
            level = 0
        else:
            level = None

        return _reindex(keyarr, level=level)
def to_tuples(self, na_tuple=True):
    """
    Return an Index of tuples of the form (left, right)

    Parameters
    ----------
    na_tuple : boolean, default True
        Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA
        value itself if False, ``nan``.

        ..versionadded:: 0.23.0

    Examples
    --------
    >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3])
    >>> idx.to_tuples()
    Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object')
    >>> idx.to_tuples(na_tuple=False)
    Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')
    """
    # pair the two sides element-wise; _asarray_tuplesafe keeps each pair
    # as a tuple inside an object ndarray
    tuples = _asarray_tuplesafe(zip(self.left, self.right))
    if not na_tuple:
        # GH 18756: replace (nan, nan) pairs with a scalar NaN
        tuples = np.where(~self._isnan, tuples, np.nan)
    return Index(tuples)
def _convert_to_indexer(self, obj, axis=0):
    """
    Convert indexing key into something we can use to do actual fancy
    indexing on an ndarray

    Examples
    ix[:5] -> slice(0, 5)
    ix[[1,2,3]] -> [1,2,3]
    ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

    Going by Zen of Python?
    "In the face of ambiguity, refuse the temptation to guess."
    raise AmbiguousIndexError with integer labels?
    - No, prefer label-based indexing
    """
    index = self.obj._get_axis(axis)

    # first attempt an exact label lookup
    try:
        return index.get_loc(obj)
    except (KeyError, TypeError):
        pass

    is_int_index = _is_integer_index(index)
    if isinstance(obj, slice):
        if _is_label_slice(index, obj):
            # label-based slice: translate endpoints to positions
            i, j = index.slice_locs(obj.start, obj.stop)

            if obj.step is not None:
                raise Exception('Non-zero step not supported with '
                                'label-based slicing')
            return slice(i, j)
        else:
            # already positional
            return obj
    elif _is_list_like(obj):
        objarr = _asarray_tuplesafe(obj)

        if objarr.dtype == np.bool_:
            # boolean mask must be label-aligned with this axis
            if not obj.index.equals(index):
                raise IndexingError('Cannot use boolean index with '
                                    'misaligned or unequal labels')
            return objarr
        else:
            # If have integer labels, defer to label-based indexing
            if _is_integer_dtype(objarr) and not is_int_index:
                return objarr

            indexer = index.get_indexer(objarr)
            mask = indexer == -1
            if mask.any():
                raise KeyError('%s not in index' % objarr[mask])

            return indexer
    else:
        if com.is_integer(obj) and not is_int_index:
            # plain integer against a non-integer index: positional
            return obj
        return index.get_loc(obj)
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
    # Resolve ``key``/``level`` into a Grouper plus the column names that
    # should be excluded from the aggregation output.
    group_axis = obj._get_axis(axis)

    if level is not None and not isinstance(group_axis, MultiIndex):
        raise ValueError('can only specify level with multi-level index')

    if not isinstance(key, (tuple, list)):
        keys = [key]
    else:
        keys = key

    # what are we after, exactly?
    match_axis_length = len(keys) == len(group_axis)
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_arraylike = any(isinstance(g, (list, tuple, np.ndarray))
                        for g in keys)

    try:
        if isinstance(obj, DataFrame):
            all_in_columns = all(g in obj.columns for g in keys)
        else:
            all_in_columns = False
    except Exception:
        # unhashable entries etc. -- treat as "not column names"
        all_in_columns = False

    if (not any_callable and not all_in_columns
            and not any_arraylike and match_axis_length
            and not level):
        # a tuple/list matching the axis length is a single grouping
        # vector of tuples, not a list of separate keys
        keys = [com._asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []
    for i, (gpr, level) in enumerate(zip(keys, levels)):
        name = None
        if _is_label_like(gpr):
            # column name: remember to exclude it from the result
            exclusions.append(gpr)
            name = gpr
            gpr = obj[gpr]
        ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
        if ping.name is None:
            ping.name = 'key_%d' % i
        groupings.append(ping)

    grouper = Grouper(group_axis, groupings, sort=sort)

    return grouper, exclusions
def get_kwargs_from_breaks(self, breaks, closed='right'):
    """
    converts intervals in breaks format to a dictionary of kwargs to
    specific to the format expected by IntervalIndex.from_tuples
    """
    if len(breaks) == 0:
        return {'data': breaks}

    # pair consecutive breakpoints into (left, right) tuples
    pairs = lzip(breaks[:-1], breaks[1:])
    if not isinstance(breaks, (list, tuple)):
        # ndarray/Index input -> hand back an object ndarray of tuples
        pairs = com._asarray_tuplesafe(pairs)
    return {'data': pairs}
def _getitem_iterable(self, key, axis=0):
    """
    Index with a list-like key along ``axis``.

    Boolean masks select positions; otherwise the key is a sequence of
    labels (or, for integer keys against a non-integer axis, positions)
    and the object is taken/reindexed accordingly.
    """
    labels = self.obj._get_axis(axis)

    def _reindex(keys, level=None):
        # DataFrame/Panel have reindex_axis; Series only supports plain
        # reindex, which is valid on axis 0 only
        try:
            return self.obj.reindex_axis(keys, axis=axis, level=level)
        except AttributeError:
            # Series
            if axis != 0:
                raise AssertionError('axis must be 0')
            return self.obj.reindex(keys, level=level)

    if com._is_bool_indexer(key):
        key = _check_bool_indexer(labels, key)
        inds, = np.asarray(key, dtype=bool).nonzero()
        return self.obj.take(inds, axis=axis)
    else:
        was_index = isinstance(key, Index)
        if was_index:
            # want Index objects to pass through untouched
            keyarr = key
        else:
            # asarray can be unsafe, NumPy strings are weird
            keyarr = _asarray_tuplesafe(key)

        if _is_integer_dtype(keyarr):
            if labels.inferred_type != 'integer':
                # negative positions wrap around from the end
                keyarr = np.where(keyarr < 0, len(labels) + keyarr, keyarr)

            if labels.inferred_type == 'mixed-integer':
                indexer = labels.get_indexer(keyarr)
                if (indexer >= 0).all():
                    # BUG FIX: the take() result was previously computed
                    # and discarded (missing return), silently falling
                    # through to the reindex path below
                    return self.obj.take(indexer, axis=axis)
                else:
                    return self.obj.take(keyarr, axis=axis)
            elif not labels.inferred_type == 'integer':
                return self.obj.take(keyarr, axis=axis)

        # this is not the most robust, but...
        if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
            # scalar keys against a MultiIndex: reindex on level 0
            level = 0
        else:
            level = None

        if labels.is_unique:
            return _reindex(keyarr, level=level)
        else:
            # non-unique axis: fall back to boolean membership selection
            mask = labels.isin(keyarr)
            return self.obj.take(mask.nonzero()[0], axis=axis)
def __new__(cls, data, dtype=None, copy=False, name=None):
    """Create a new Index from array-like ``data``."""
    # reject scalars up front (an ndarray is never a scalar, so this
    # check is equivalent to the original ordering)
    if np.isscalar(data):
        raise ValueError("Index(...) must be called with a collection "
                         "of some kind, %s was passed" % repr(data))

    if isinstance(data, np.ndarray):
        if dtype is None and issubclass(data.dtype.type, np.integer):
            # plain integer ndarray -> specialized Int64Index
            return Int64Index(data, copy=copy, name=name)
        subarr = np.array(data, dtype=object, copy=copy)
    else:
        # other iterable of some kind
        subarr = _asarray_tuplesafe(data, dtype=object)

    subarr = subarr.view(cls)
    subarr.name = name
    return subarr
def _prep_window(self, **kwargs):
    """ provide validation for our window type, return the window """
    window = self._get_window()

    if isinstance(window, (list, tuple, np.ndarray)):
        # explicit weights supplied: just coerce to a float ndarray
        return com._asarray_tuplesafe(window).astype(float)

    if com.is_integer(window):
        # integer window length + win_type -> ask scipy for the weights
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window weight')
        win_type = _validate_win_type(self.win_type, kwargs)  # may pop from kwargs
        return sig.get_window(win_type, window).astype(float)

    raise ValueError('Invalid window %s' % str(window))
def unique(values):
    """
    Compute unique values (not necessarily sorted) efficiently from input array
    of values

    Parameters
    ----------
    values : array-like

    Returns
    -------
    uniques
    """
    values = com._asarray_tuplesafe(values)

    def f(htype, caster):
        return _unique_generic(values, htype, caster)

    return _hashtable_algo(f, values.dtype)
def _read_panel_table(self, group, where=None):
    # Read a stored LongPanel from the HDF group and pivot it back to a
    # wide Panel, de-duplicating rows when the table is inconsistent.
    from pandas.core.common import _asarray_tuplesafe

    table = getattr(group, 'table')

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    # reconstruct
    long_index = MultiIndex.from_arrays([index, columns])
    lp = LongPanel(sel.values['values'], index=long_index,
                   columns=fields)

    if lp.consistent:
        lp = lp.sortlevel(level=0)
        wp = lp.to_wide()
    else:
        if not self._quiet:  # pragma: no cover
            print ('Duplicate entries in table, taking most recently '
                   'appended')

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # keep only the most recently appended row for each key
        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = LongPanel(new_values, index=new_index,
                       columns=lp.columns)
        wp = lp.to_wide()

    if sel.column_filter:
        # restrict the minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def _prep_window(self, **kwargs):
    """ provide validation for our window type, return the window """
    window = self._get_window()

    if isinstance(window, (list, tuple, np.ndarray)):
        # explicit weights were passed; just coerce to float
        return com._asarray_tuplesafe(window).astype(float)

    if com.is_integer(window):
        # integer length: generate the weights via scipy
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window '
                              'weight')
        # the below may pop from kwargs
        win_type = _validate_win_type(self.win_type, kwargs)
        return sig.get_window(win_type, window).astype(float)

    raise ValueError('Invalid window %s' % str(window))
def _getitem_iterable(self, key, axis=0):
    """
    Index with a list-like key along ``axis``.

    Boolean masks select positions; otherwise the key is a sequence of
    labels (or positions, for integer keys against a non-integer axis)
    and the object is taken/reindexed accordingly.
    """
    labels = self.obj._get_axis(axis)

    def _reindex(keys, level=None):
        # DataFrame/Panel have reindex_axis; Series only supports plain
        # reindex (axis 0 only)
        try:
            return self.obj.reindex_axis(keys, axis=axis, level=level)
        except AttributeError:
            # Series
            assert(axis == 0)
            return self.obj.reindex(keys, level=level)

    if com._is_bool_indexer(key):
        key = _check_bool_indexer(labels, key)
        inds, = np.asarray(key, dtype=bool).nonzero()
        return self.obj.take(inds, axis=axis)
    else:
        if isinstance(key, Index):
            # want Index objects to pass through untouched
            keyarr = key
        else:
            # asarray can be unsafe, NumPy strings are weird
            keyarr = _asarray_tuplesafe(key)

        if _is_integer_dtype(keyarr):
            if labels.inferred_type == 'mixed-integer':
                indexer = labels.get_indexer(keyarr)
                if (indexer >= 0).all():
                    # BUG FIX: the take() result was previously computed
                    # and discarded (missing return), silently falling
                    # through to the reindex path below
                    return self.obj.take(indexer, axis=axis)
                else:
                    return self.obj.take(keyarr, axis=axis)
            elif not labels.inferred_type == 'integer':
                return self.obj.take(keyarr, axis=axis)

        # this is not the most robust, but...
        if (isinstance(labels, MultiIndex) and
                not isinstance(keyarr[0], tuple)):
            # scalar keys against a MultiIndex: reindex on level 0
            level = 0
        else:
            level = None

        if labels.is_unique:
            return _reindex(keyarr, level=level)
        else:
            # non-unique axis: fall back to boolean membership selection
            mask = labels.isin(keyarr)
            return self.obj.take(mask.nonzero()[0], axis=axis)
def _read_panel_table(self, group, where=None):
    # Read a stored LongPanel from the HDF group and pivot it back to a
    # wide Panel, de-duplicating rows when the table is inconsistent.
    from pandas.core.common import _asarray_tuplesafe

    table = getattr(group, 'table')

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    # reconstruct
    long_index = MultiIndex.from_arrays([index, columns])
    lp = LongPanel(sel.values['values'], index=long_index,
                   columns=fields)

    if lp.consistent:
        lp = lp.sortlevel(level=0)
        wp = lp.to_wide()
    else:
        if not self._quiet:  # pragma: no cover
            print('Duplicate entries in table, taking most recently '
                  'appended')

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # keep only the most recently appended row for each key
        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = LongPanel(new_values, index=new_index,
                       columns=lp.columns)
        wp = lp.to_wide()

    if sel.column_filter:
        # restrict the minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def _convert_for_reindex(self, key, axis=0):
    """Translate an indexing key into labels suitable for reindexing."""
    labels = self.obj._get_axis(axis)

    if com._is_bool_indexer(key):
        # boolean mask -> the labels where the mask is True
        key = _check_bool_indexer(labels, key)
        return labels[np.asarray(key)]

    # want Index objects to pass through untouched; plain asarray can be
    # unsafe, NumPy strings are weird
    keyarr = key if isinstance(key, Index) else _asarray_tuplesafe(key)

    if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
        # positional integers against a label index -> map to labels
        return labels.take(keyarr)

    return keyarr
def test_to_tuples_na(self, tuples, na_tuple):
    # GH 18756
    idx = IntervalIndex.from_tuples(tuples)
    result = idx.to_tuples(na_tuple=na_tuple)

    # non-NA portion must round-trip exactly
    expected_notna = Index(com._asarray_tuplesafe(tuples[:-1]))
    tm.assert_index_equal(result[:-1], expected_notna)

    # NA portion: either a (nan, nan) tuple or a scalar NA
    na_part = result[-1]
    if not na_tuple:
        assert isna(na_part)
    else:
        assert isinstance(na_part, tuple)
        assert len(na_part) == 2
        for side in na_part:
            assert isna(side)
def _getitem_iterable(self, key, axis=0):
    """Index with a list-like or boolean key along ``axis`` via reindex."""
    labels = self.obj._get_axis(axis)
    axis_name = self.obj._get_axis_name(axis)

    if com._is_bool_indexer(key):
        # boolean mask -> the labels where the mask is True
        key = _check_bool_indexer(labels, key)
        selection = labels[np.asarray(key)]
    else:
        # want Index objects to pass through untouched; plain asarray can
        # be unsafe, NumPy strings are weird
        selection = key if isinstance(key, Index) else _asarray_tuplesafe(key)
        if _is_integer_dtype(selection) and not _is_integer_index(labels):
            # positional integers against a label index -> map to labels
            selection = labels.take(selection)

    return self.obj.reindex(**{axis_name: selection})
def __init__(self, index, grouper=None, name=None, level=None, sort=True):
    # Represent one grouping vector: either an explicit ``grouper``
    # (array/Series/mapping/function) or a ``level`` of a MultiIndex.
    self.name = name
    self.level = level
    self.grouper = _convert_grouper(index, grouper)
    self.index = index
    self.sort = sort

    # right place for this?
    if isinstance(grouper, Series) and name is None:
        self.name = grouper.name

    # pre-computed
    self._was_factor = False

    if level is not None:
        if not isinstance(level, int):
            # level given by name: translate to its position
            assert(level in index.names)
            level = index.names.index(level)

        inds = index.labels[level]
        level_index = index.levels[level]

        if self.name is None:
            self.name = index.names[level]

        # XXX complete hack
        level_values = index.levels[level].take(inds)
        if grouper is not None:
            # group on a transformation of the level values
            self.grouper = level_values.map(self.grouper)
        else:
            self._was_factor = True
            self._labels = inds
            self._group_index = level_index
            self.grouper = level_values
    else:
        if isinstance(self.grouper, (list, tuple)):
            self.grouper = com._asarray_tuplesafe(self.grouper)

        # no level passed
        if not isinstance(self.grouper, np.ndarray):
            # mapping/function grouper: evaluate against the axis labels
            self.grouper = self.index.map(self.grouper)
def __init__(self, index, grouper=None, name=None, level=None, sort=True):
    # Represent one grouping vector: either an explicit ``grouper``
    # (array/Series/mapping/function) or a ``level`` of a MultiIndex.
    self.name = name
    self.level = level
    self.grouper = _convert_grouper(index, grouper)
    self.index = index
    self.sort = sort

    # right place for this?
    if isinstance(grouper, Series) and name is None:
        self.name = grouper.name

    # pre-computed
    self._was_factor = False

    if level is not None:
        if not isinstance(level, int):
            # level given by name: translate to its position
            assert (level in index.names)
            level = index.names.index(level)

        inds = index.labels[level]
        level_index = index.levels[level]

        if self.name is None:
            self.name = index.names[level]

        # XXX complete hack
        level_values = index.levels[level].take(inds)
        if grouper is not None:
            # group on a transformation of the level values
            self.grouper = level_values.map(self.grouper)
        else:
            self._was_factor = True
            self._labels = inds
            self._group_index = level_index
            self.grouper = level_values
    else:
        if isinstance(self.grouper, (list, tuple)):
            self.grouper = com._asarray_tuplesafe(self.grouper)

        # no level passed
        if not isinstance(self.grouper, np.ndarray):
            # mapping/function grouper: evaluate against the axis labels
            self.grouper = self.index.map(self.grouper)
def _fancy_getitem_iterable(self, key, axis=0):
    # List-like fancy indexing along ``axis``; returns a reindexed frame.
    from pandas.core.series import Series

    labels = self.frame._get_axis(axis)
    axis_name = self.frame._get_axis_name(axis)

    # asarray can be unsafe, NumPy strings are weird
    keyarr = _asarray_tuplesafe(key)

    if keyarr.dtype == np.bool_:
        # boolean mask: a Series mask must be label-aligned with the axis
        if isinstance(key, Series):
            if not key.index.equals(labels):
                raise Exception('Cannot use boolean index with misaligned '
                                'or unequal labels')
        return self.frame.reindex(**{axis_name : labels[np.asarray(key)]})
    else:
        if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
            # positional integers against a label index -> map to labels
            keyarr = labels.take(keyarr)

        return self.frame.reindex(**{axis_name : keyarr})
def drop(self, labels):
    """
    Make a new Index with the passed labels deleted.

    Parameters
    ----------
    labels : array-like

    Returns
    -------
    dropped : Index
    """
    try:
        if not isinstance(labels, np.ndarray):
            labels = _asarray_tuplesafe(labels)
        indexer, mask = self.get_indexer(labels)
        if not mask.all():
            # BUG FIX: invert the boolean mask with ~ ; unary minus on a
            # boolean array is deprecated/invalid in numpy.
            # NOTE(review): this ValueError is caught by the enclosing
            # except below and triggers the per-label fallback loop.
            raise ValueError('labels %s not contained in axis'
                             % labels[~mask])
        return self.delete(indexer)
    except Exception:
        # fall through to a label-by-label lookup
        pass

    inds = []
    for label in labels:
        loc = self.get_loc(label)
        if isinstance(loc, int):
            inds.append(loc)
        else:
            # slice of positions for a partial match
            inds.extend(range(loc.start, loc.stop))
    return self.delete(inds)
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort :
    order :

    Returns
    -------
    """
    values = np.asarray(values)
    # datetime64 input is factorized through a converted view; remember
    # so the uniques can be restored to datetime64 at the end
    is_datetime = com.is_datetime64_dtype(values)
    hash_klass, values = _get_data_algo(values, _hashtables)

    uniques = []
    table = hash_klass(len(values))
    labels, counts = table.get_labels(values, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = com._asarray_tuplesafe(uniques)
    if sort and len(counts) > 0:
        sorter = uniques.argsort()
        # invert the permutation: old label -> position in sorted uniques
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)  # keep the NA sentinel untouched

        uniques = uniques.take(sorter)
        counts = counts.take(sorter)

    if is_datetime:
        uniques = np.array(uniques, dtype='M8[ns]')

    return labels, uniques, counts
def _getitem_iterable(self, key, axis=0):
    """Index with a list-like or boolean key along ``axis`` via reindex."""
    labels = self.obj._get_axis(axis)
    axis_name = self.obj._get_axis_name(axis)

    # want Index objects to pass through untouched; plain asarray can be
    # unsafe, NumPy strings are weird
    keyarr = key if isinstance(key, Index) else _asarray_tuplesafe(key)

    if keyarr.dtype == np.bool_:
        # a boolean Series mask must be label-aligned with this axis
        if _is_series(key) and not key.index.equals(labels):
            raise IndexingError('Cannot use boolean index with '
                                'misaligned or unequal labels')
        return self.obj.reindex(**{axis_name: labels[np.asarray(key)]})

    if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
        # positional integers against a label index -> map to labels
        keyarr = labels.take(keyarr)
    return self.obj.reindex(**{axis_name: keyarr})
def _getitem_iterable(self, key, axis=0):
    """Select a list-like (or boolean) key along ``axis`` by reindexing."""
    labels = self.obj._get_axis(axis)
    axis_name = self.obj._get_axis_name(axis)

    if isinstance(key, Index):
        # want Index objects to pass through untouched
        keyarr = key
    else:
        # asarray can be unsafe, NumPy strings are weird
        keyarr = _asarray_tuplesafe(key)

    if keyarr.dtype != np.bool_:
        if _is_integer_dtype(keyarr) and not _is_integer_index(labels):
            # positional integers against a label index -> map to labels
            keyarr = labels.take(keyarr)
        return self.obj.reindex(**{axis_name : keyarr})

    # boolean mask path: a Series mask must be label-aligned
    if _is_series(key):
        if not key.index.equals(labels):
            raise IndexingError('Cannot use boolean index with '
                                'misaligned or unequal labels')
    return self.obj.reindex(**{axis_name : labels[np.asarray(key)]})
def test_int64_overflow(self):
    # grouping on 8 key columns overflows the combined int64 key space;
    # results must still match a direct tuple-key groupby
    B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
    A = np.arange(2500)
    data = {col: (A if col in 'ACEG' else B) for col in 'ABCDEFGH'}
    data['values'] = np.random.randn(2500)
    df = DataFrame(data)

    cols = list('ABCDEFGH')
    left = df.groupby(cols).sum()['values']
    right = df.groupby(cols[::-1]).sum()['values']

    # both results must come out sorted by their MultiIndex
    exp_index, _ = left.index.sortlevel()
    tm.assert_index_equal(left.index, exp_index)

    exp_index, _ = right.index.sortlevel(0)
    tm.assert_index_equal(right.index, exp_index)

    # reference: group directly on the row tuples
    tups = com._asarray_tuplesafe(list(map(tuple, df[cols].values)))
    expected = df.groupby(tups).sum()['values']

    for key, val in compat.iteritems(expected):
        assert left[key] == right[key[::-1]]
        assert left[key] == val
    assert len(left) == len(right)
def test_int64_overflow(self):
    # Grouping on 8 key columns overflows the combined int64 key space;
    # the group sums must still match a direct tuple-key groupby.
    B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
    A = np.arange(2500)
    df = DataFrame({'A': A,
                    'B': B,
                    'C': A,
                    'D': B,
                    'E': A,
                    'F': B,
                    'G': A,
                    'H': B,
                    'values': np.random.randn(2500)})

    lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
    rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])

    left = lg.sum()['values']
    right = rg.sum()['values']

    # both results must come out sorted by their MultiIndex
    exp_index, _ = left.index.sortlevel()
    tm.assert_index_equal(left.index, exp_index)

    exp_index, _ = right.index.sortlevel(0)
    tm.assert_index_equal(right.index, exp_index)

    tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'
                               ]].values))
    tups = com._asarray_tuplesafe(tups)

    # reference result: group directly on the row tuples
    expected = df.groupby(tups).sum()['values']

    for k, v in compat.iteritems(expected):
        assert left[k] == right[k[::-1]]
        assert left[k] == v
    assert len(left) == len(right)
def _convert_1d(values, unit, axis):
    # Convert one datetime-like scalar or array to matplotlib float
    # date ordinals.
    def try_parse(values):
        # best-effort string parse; return the input unchanged on failure
        try:
            return _dt_to_float_ordinal(tools.to_datetime(values))
        except Exception:
            return values

    if isinstance(values, (datetime, pydt.date)):
        return _dt_to_float_ordinal(values)
    elif isinstance(values, np.datetime64):
        return _dt_to_float_ordinal(lib.Timestamp(values))
    elif isinstance(values, pydt.time):
        return dates.date2num(values)
    elif (is_integer(values) or is_float(values)):
        # already numeric -> assumed to be an ordinal
        return values
    elif isinstance(values, compat.string_types):
        return try_parse(values)
    elif isinstance(values, (list, tuple, np.ndarray, Index)):
        if isinstance(values, Index):
            values = values.values
        if not isinstance(values, np.ndarray):
            values = com._asarray_tuplesafe(values)

        if is_integer_dtype(values) or is_float_dtype(values):
            # numeric arrays pass through untouched
            return values

        try:
            values = tools.to_datetime(values)
            if isinstance(values, Index):
                values = _dt_to_float_ordinal(values)
            else:
                values = [_dt_to_float_ordinal(x) for x in values]
        except Exception:
            # fall back to converting whatever we currently have
            values = _dt_to_float_ordinal(values)

    return values
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, string_types):
        # hashtable machinery expects object arrays for strings
        values = np.array(values, dtype='O')

    def f(htype, caster):
        return _match_generic(to_match, values, htype, caster)

    result = _hashtable_algo(f, values.dtype, np.int64)

    if na_sentinel == -1:
        return result

    # swap the default -1 sentinel for the requested one; a Series handles
    # any dtype conversions properly, then hand back a plain ndarray
    from pandas.core.series import Series
    replaced = Series(result.ravel()).replace(-1, na_sentinel)
    return replaced.values.reshape(result.shape)
def test_to_tuples(self, tuples):
    # GH 18756: to_tuples must round-trip the constructor input
    idx = IntervalIndex.from_tuples(tuples)
    expected = Index(com._asarray_tuplesafe(tuples))
    tm.assert_index_equal(idx.to_tuples(), expected)
def _convert_to_indexer(self, obj, axis=0):
    """
    Convert indexing key into something we can use to do actual fancy
    indexing on an ndarray

    Examples
    ix[:5] -> slice(0, 5)
    ix[[1,2,3]] -> [1,2,3]
    ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

    Going by Zen of Python?
    "In the face of ambiguity, refuse the temptation to guess."
    raise AmbiguousIndexError with integer labels?
    - No, prefer label-based indexing
    """
    labels = self.obj._get_axis(axis)
    is_int_index = _is_integer_index(labels)

    if com.is_integer(obj) and not is_int_index:
        # plain integer against a non-integer index: positional
        return obj

    # first attempt an exact label lookup
    try:
        return labels.get_loc(obj)
    except (KeyError, TypeError):
        pass

    if isinstance(obj, slice):
        ltype = labels.inferred_type

        if ltype == 'floating':
            int_slice = _is_int_slice(obj)
        else:
            # floats that are within tolerance of int used
            int_slice = _is_index_slice(obj)

        null_slice = obj.start is None and obj.stop is None

        # could have integers in the first level of the MultiIndex
        position_slice = (int_slice
                          and not ltype == 'integer'
                          and not isinstance(labels, MultiIndex))

        start, stop = obj.start, obj.stop

        # last ditch effort: if we are mixed and have integers
        try:
            if 'mixed' in ltype and int_slice:
                if start is not None:
                    i = labels.get_loc(start)
                if stop is not None:
                    j = labels.get_loc(stop)
                position_slice = False
        except KeyError:
            if ltype == 'mixed-integer-float':
                raise

        if null_slice or position_slice:
            # pass the slice through positionally
            slicer = obj
        else:
            # label-based slice: translate endpoints to positions
            try:
                i, j = labels.slice_locs(start, stop)
                slicer = slice(i, j, obj.step)
            except Exception:
                if _is_index_slice(obj):
                    if labels.inferred_type == 'integer':
                        raise
                    slicer = obj
                else:
                    raise

        return slicer

    elif _is_list_like(obj):
        if com._is_bool_indexer(obj):
            objarr = _check_bool_indexer(labels, obj)
            return objarr
        else:
            if isinstance(obj, Index):
                objarr = obj.values
            else:
                objarr = _asarray_tuplesafe(obj)

            # If have integer labels, defer to label-based indexing
            if _is_integer_dtype(objarr) and not is_int_index:
                if labels.inferred_type != 'integer':
                    # negative positions wrap around from the end
                    objarr = np.where(objarr < 0,
                                      len(labels) + objarr, objarr)
                return objarr

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex)
                    and not isinstance(objarr[0], tuple)):
                # scalar keys against a MultiIndex: match on level 0
                level = 0
                _, indexer = labels.reindex(objarr, level=level)
                check = labels.levels[0].get_indexer(objarr)
            else:
                level = None

                # XXX
                if labels.is_unique:
                    indexer = check = labels.get_indexer(objarr)
                else:
                    # non-unique index: build a boolean union of matches
                    mask = np.zeros(len(labels), dtype=bool)
                    lvalues = labels.values
                    for x in objarr:
                        # ugh
                        to_or = lib.map_infer(lvalues, x.__eq__)
                        if not to_or.any():
                            raise KeyError('%s not in index' % str(x))
                        mask |= to_or

                    indexer = check = mask.nonzero()[0]

            mask = check == -1
            if mask.any():
                raise KeyError('%s not in index' % objarr[mask])

            return indexer
    else:
        return labels.get_loc(obj)
def _getitem_iterable(self, key, axis=0):
    """
    Index with a list-like key along ``axis``.

    Handles boolean masks, integer position keys, label keys against
    unique axes (reindex) and label keys against non-unique axes
    (take + merge of the missing labels).
    """
    labels = self.obj._get_axis(axis)

    def _reindex(keys, level=None):
        # DataFrame/Panel have reindex_axis; Series only supports plain
        # reindex (axis 0 only)
        try:
            return self.obj.reindex_axis(keys, axis=axis, level=level)
        except AttributeError:
            # Series
            if axis != 0:
                raise AssertionError('axis must be 0')
            return self.obj.reindex(keys, level=level)

    if com._is_bool_indexer(key):
        key = _check_bool_indexer(labels, key)
        inds, = key.nonzero()
        return self.obj.take(inds, axis=axis, convert=False)
    else:
        if isinstance(key, Index):
            # want Index objects to pass through untouched
            keyarr = key
        else:
            # asarray can be unsafe, NumPy strings are weird
            keyarr = _asarray_tuplesafe(key)

        if _is_integer_dtype(keyarr):
            if labels.inferred_type != 'integer':
                # negative positions wrap around from the end
                keyarr = np.where(keyarr < 0, len(labels) + keyarr, keyarr)

            if labels.inferred_type == 'mixed-integer':
                indexer = labels.get_indexer(keyarr)
                if (indexer >= 0).all():
                    # BUG FIX: the take() result was previously computed
                    # and discarded (missing return), silently falling
                    # through to the reindex path below
                    return self.obj.take(indexer, axis=axis, convert=True)
                else:
                    return self.obj.take(keyarr, axis=axis)
            elif not labels.inferred_type == 'integer':
                return self.obj.take(keyarr, axis=axis)

        # this is not the most robust, but...
        if (isinstance(labels, MultiIndex)
                and not isinstance(keyarr[0], tuple)):
            # scalar keys against a MultiIndex: reindex on level 0
            level = 0
        else:
            level = None

        if labels.is_unique and Index(keyarr).is_unique:
            return _reindex(keyarr, level=level)
        else:
            # non-unique axis (or duplicated key): take what exists, then
            # splice placeholder entries for the missing labels
            indexer, missing = labels.get_indexer_non_unique(keyarr)
            check = indexer != -1
            result = self.obj.take(indexer[check], axis=axis,
                                   convert=False)

            # need to merge the result labels and the missing labels
            if len(missing):
                l = np.arange(len(indexer))

                missing = com._ensure_platform_int(missing)
                missing_labels = keyarr.take(missing)
                missing_indexer = com._ensure_int64(l[~check])
                cur_labels = result._get_axis(axis).values
                cur_indexer = com._ensure_int64(l[check])

                # interleave found and missing labels back into the order
                # the caller requested; -1 marks positions to NaN-fill
                new_labels = np.empty(tuple([len(indexer)]), dtype=object)
                new_labels[cur_indexer] = cur_labels
                new_labels[missing_indexer] = missing_labels
                new_indexer = (Index(cur_indexer) +
                               Index(missing_indexer)).values
                new_indexer[missing_indexer] = -1

                # reindex with the specified axis
                ndim = self.obj.ndim
                if axis + 1 > ndim:
                    raise AssertionError(
                        "invalid indexing error with non-unique index")

                args = [None] * (2 * ndim)
                args[2 * axis] = new_labels
                args[2 * axis + 1] = new_indexer

                result = result._reindex_with_indexers(
                    *args, copy=False, fill_value=np.nan)

            return result
def rolling_window(arg, window=None, win_type=None, min_periods=None,
                   freq=None, center=False, mean=True, axis=0, how=None,
                   **kwargs):
    """
    Apply a weighted moving window of size ``window`` to ``arg``.

    Parameters
    ----------
    arg : Series, DataFrame
    window : int or ndarray
        Weighting window specification. An integer is the window length
        (``win_type`` is then required); an array gives the weights
        directly.
    win_type : str, default None
        Window type (see Notes).
    min_periods : int, default None
        Minimum number of observations in window required to have a value
        (otherwise result is NA).
    freq : string or DateOffset object, optional (default None)
        Frequency to conform the data to before computing the statistic.
        Specified as a frequency string or DateOffset object.
    center : boolean, default False
        Whether the label should correspond with center of window.
    mean : boolean, default True
        If True computes weighted mean, else weighted sum.
    axis : {0, 1}, default 0
    how : string, default 'mean'
        Method for down- or re-sampling.

    Returns
    -------
    y : type of input argument

    Notes
    -----
    Recognized window types: ``boxcar``, ``triang``, ``blackman``,
    ``hamming``, ``bartlett``, ``parzen``, ``bohman``,
    ``blackmanharris``, ``nuttall``, ``barthann``, ``kaiser`` (needs
    beta), ``gaussian`` (needs std), ``general_gaussian`` (needs power,
    width), ``slepian`` (needs width).

    By default, the result is set to the right edge of the window. This
    can be changed to the center of the window by setting
    ``center=True``.

    The `freq` keyword is used to conform time series data to a
    specified frequency by resampling the data. This is done with the
    default parameters of :meth:`~pandas.Series.resample` (i.e. using
    the `mean`).
    """
    if isinstance(window, (list, tuple, np.ndarray)):
        # caller supplied explicit weights; a window type is ambiguous here
        if win_type is not None:
            raise ValueError(('Do not specify window type if using custom '
                              'weights'))
        weights = pdcom._asarray_tuplesafe(window).astype(float)
    elif pdcom.is_integer(window):  # window size
        if win_type is None:
            raise ValueError('Must specify window type')
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window '
                              'weight')
        win_type = _validate_win_type(win_type, kwargs)  # may pop from kwargs
        weights = sig.get_window(win_type, window).astype(float)
    else:
        raise ValueError('Invalid window %s' % str(window))

    minp = _use_window(min_periods, len(weights))
    arg = _conv_timerule(arg, freq, how)
    return_hook, values = _process_data_structure(arg)

    def _apply(x):
        return algos.roll_window(x, weights, minp, avg=mean)

    rs = return_hook(np.apply_along_axis(_apply, axis, values))
    if center:
        rs = _center_window(rs, len(weights), axis)
    return rs
def rolling_window(arg, window=None, win_type=None, min_periods=None,
                   freq=None, center=False, mean=True, axis=0, how=None,
                   **kwargs):
    """
    Apply a weighted moving window of size ``window`` to ``arg``.

    Parameters
    ----------
    arg : Series, DataFrame
    window : int or ndarray
        Weighting window specification. An integer is the window length
        (``win_type`` is then required); an array gives the weights
        directly.
    win_type : str, default None
        Window type (see Notes).
    min_periods : int, default None
        Minimum number of observations in window required to have a value
        (otherwise result is NA).
    freq : string or DateOffset object, optional (default None)
        Frequency to conform the data to before computing the statistic.
        Specified as a frequency string or DateOffset object.
    center : boolean, default False
        Whether the label should correspond with center of window.
    mean : boolean, default True
        If True computes weighted mean, else weighted sum.
    axis : {0, 1}, default 0
    how : string, default 'mean'
        Method for down- or re-sampling.

    Returns
    -------
    y : type of input argument

    Notes
    -----
    Recognized window types: ``boxcar``, ``triang``, ``blackman``,
    ``hamming``, ``bartlett``, ``parzen``, ``bohman``,
    ``blackmanharris``, ``nuttall``, ``barthann``, ``kaiser`` (needs
    beta), ``gaussian`` (needs std), ``general_gaussian`` (needs power,
    width), ``slepian`` (needs width).

    By default, the result is set to the right edge of the window. This
    can be changed to the center of the window by setting
    ``center=True``.

    The `freq` keyword is used to conform time series data to a
    specified frequency by resampling the data. This is done with the
    default parameters of :meth:`~pandas.Series.resample` (i.e. using
    the `mean`).
    """
    if isinstance(window, (list, tuple, np.ndarray)):
        # caller supplied explicit weights; a window type is ambiguous here
        if win_type is not None:
            raise ValueError(('Do not specify window type if using custom '
                              'weights'))
        weights = pdcom._asarray_tuplesafe(window).astype(float)
    elif pdcom.is_integer(window):  # window size
        if win_type is None:
            raise ValueError('Must specify window type')
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window '
                              'weight')
        win_type = _validate_win_type(win_type, kwargs)  # may pop from kwargs
        weights = sig.get_window(win_type, window).astype(float)
    else:
        raise ValueError('Invalid window %s' % str(window))

    minp = _use_window(min_periods, len(weights))
    arg = _conv_timerule(arg, freq, how)
    return_hook, values = _process_data_structure(arg)

    # when centering, pad the tail with NaNs so the rolled result can be
    # shifted back by half a window afterwards
    offset = int((len(weights) - 1) / 2.) if center else 0
    additional_nans = np.array([np.NaN] * offset)

    def _apply(x):
        if center:
            x = np.concatenate((x, additional_nans))
        return algos.roll_window(x, weights, minp, avg=mean)

    result = np.apply_along_axis(_apply, axis, values)
    if center:
        result = _center_window(result, len(weights), axis)
    return return_hook(result)
def _convert_to_indexer(self, obj, axis=0):
    """
    Convert indexing key into something we can use to do actual fancy
    indexing on an ndarray.

    Examples
    --------
    ix[:5] -> slice(0, 5)
    ix[[1,2,3]] -> [1,2,3]
    ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

    Going by Zen of Python?
    "In the face of ambiguity, refuse the temptation to guess."
    raise AmbiguousIndexError with integer labels?
    - No, prefer label-based indexing
    """
    labels = self.obj._get_axis(axis)
    is_int_index = _is_integer_index(labels)

    # integer key on a non-integer axis: treat as a position directly
    if com.is_integer(obj) and not is_int_index:
        return obj

    # first try an exact label lookup
    try:
        return labels.get_loc(obj)
    except (KeyError, TypeError):
        pass

    if isinstance(obj, slice):
        # a slice of ints (or None endpoints) may be positional
        int_slice = _is_index_slice(obj)
        null_slice = obj.start is None and obj.stop is None
        # could have integers in the first level of the MultiIndex
        position_slice = (int_slice
                          and not labels.inferred_type == 'integer'
                          and not isinstance(labels, MultiIndex))

        start, stop = obj.start, obj.stop

        # last ditch effort: if we are mixed and have integers
        try:
            if 'mixed' in labels.inferred_type and int_slice:
                if start is not None:
                    i = labels.get_loc(start)
                if stop is not None:
                    j = labels.get_loc(stop)
                # both endpoints resolved as labels -> label-based slice
                position_slice = False
        except KeyError:
            if labels.inferred_type == 'mixed-integer':
                raise

        if null_slice or position_slice:
            # pass the slice through unchanged (positional semantics)
            slicer = obj
        else:
            try:
                i, j = labels.slice_locs(start, stop)
                slicer = slice(i, j, obj.step)
            except Exception:
                if _is_index_slice(obj):
                    if labels.inferred_type == 'integer':
                        raise
                    slicer = obj
                else:
                    raise

        return slicer

    elif _is_list_like(obj):
        if com._is_bool_indexer(obj):
            # boolean mask: validated and returned as-is
            objarr = _check_bool_indexer(labels, obj)
            return objarr
        else:
            objarr = _asarray_tuplesafe(obj)

            # If have integer labels, defer to label-based indexing
            if _is_integer_dtype(objarr) and not is_int_index:
                return objarr

            # label array -> positional indexer; every label must exist
            indexer = labels.get_indexer(objarr)
            mask = indexer == -1
            if mask.any():
                raise KeyError('%s not in index' % objarr[mask])

            return indexer
    else:
        # scalar fallback: let the index resolve (or raise)
        return labels.get_loc(obj)
def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             sort=True, observed=False, in_axis=False):
    """
    Set up a single Grouping from one of the many things a grouper can
    be (level, Grouper, Categorical, list/tuple, callable, column name).

    Exactly one of the branch cascades below resolves ``self.grouper``
    into an array-like aligned with ``index``.
    """
    self.name = name
    self.level = level
    self.grouper = _convert_grouper(index, grouper)
    self.all_grouper = None
    self.index = index
    self.sort = sort
    self.obj = obj
    self.observed = observed
    self.in_axis = in_axis

    # right place for this?
    if isinstance(grouper, (Series, Index)) and name is None:
        self.name = grouper.name

    if isinstance(grouper, MultiIndex):
        self.grouper = grouper.values

    # we have a single grouper which may be a myriad of things,
    # some of which are dependent on the passing in level
    if level is not None:
        if not isinstance(level, int):
            # resolve a level name to its integer position
            if level not in index.names:
                raise AssertionError('Level %s not in index' % str(level))
            level = index.names.index(level)

        if self.name is None:
            self.name = index.names[level]

        self.grouper, self._labels, self._group_index = \
            index._get_grouper_for_level(self.grouper, level)

    # a passed Grouper like, directly get the grouper in the same way
    # as single grouper groupby, use the group_info to get labels
    elif isinstance(self.grouper, Grouper):
        # get the new grouper; we already have disambiguated
        # what key/level refer to exactly, don't need to
        # check again as we have by this point converted these
        # to an actual value (rather than a pd.Grouper)
        _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
        if self.name is None:
            self.name = grouper.result_index.name
        self.obj = self.grouper.obj
        self.grouper = grouper

    else:
        if self.grouper is None and self.name is not None:
            # a bare name: group by the corresponding column of obj
            self.grouper = self.obj[self.name]

        elif isinstance(self.grouper, (list, tuple)):
            self.grouper = com._asarray_tuplesafe(self.grouper)

        # a passed Categorical
        elif is_categorical_dtype(self.grouper):
            from pandas.core.groupby.categorical import recode_for_groupby
            self.grouper, self.all_grouper = recode_for_groupby(
                self.grouper, self.sort, observed)
            categories = self.grouper.categories

            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes
            self._labels = self.grouper.codes
            if observed:
                # only the codes actually present in the data
                codes = algorithms.unique1d(self.grouper.codes)
            else:
                codes = np.arange(len(categories))

            self._group_index = CategoricalIndex(
                Categorical.from_codes(codes=codes,
                                       categories=categories,
                                       ordered=self.grouper.ordered))

        # we are done
        if isinstance(self.grouper, Grouping):
            self.grouper = self.grouper.grouper

        # no level passed
        elif not isinstance(self.grouper,
                            (Series, Index, ExtensionArray, np.ndarray)):
            if getattr(self.grouper, 'ndim', 1) != 1:
                t = self.name or str(type(self.grouper))
                raise ValueError("Grouper for '%s' not 1-dimensional" % t)
            # anything else (e.g. a callable or dict) is mapped over index
            self.grouper = self.index.map(self.grouper)
            if not (hasattr(self.grouper, "__len__") and
                    len(self.grouper) == len(self.index)):
                errmsg = ('Grouper result violates len(labels) == '
                          'len(data)\nresult: %s' %
                          pprint_thing(self.grouper))
                self.grouper = None  # Try for sanity
                raise AssertionError(errmsg)

    # if we have a date/time-like grouper, make sure that we have
    # Timestamps like
    if getattr(self.grouper, 'dtype', None) is not None:
        if is_datetime64_dtype(self.grouper):
            from pandas import to_datetime
            self.grouper = to_datetime(self.grouper)
        elif is_timedelta64_dtype(self.grouper):
            from pandas import to_timedelta
            self.grouper = to_timedelta(self.grouper)
def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
                 observed=False, mutated=False, validate=True):
    """
    create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values

    If validate, then check for key/level overlaps

    Returns
    -------
    tuple of (grouper, exclusions, obj)
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError('No group keys passed!')
                else:
                    raise ValueError('multiple levels only valid with '
                                     'MultiIndex')

            if isinstance(level, compat.string_types):
                if obj.index.name != level:
                    raise ValueError('level name %s is not the name of the '
                                     'index' % level)
            elif level > 0 or level < -1:
                raise ValueError('level > 0 or level < -1 only valid with '
                                 ' MultiIndex')

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, set([key.key]), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    # In the future, a tuple key will always mean an actual key,
    # not an iterable of keys. In the meantime, we attempt to provide
    # a warning. We can assume that the user wanted a list of keys when
    # the key is not in the index. We just have to be careful with
    # unhashble elements of `key`. Any unhashable elements implies that
    # they wanted a list of keys.
    # https://github.com/pandas-dev/pandas/issues/18314
    is_tuple = isinstance(key, tuple)
    all_hashable = is_tuple and is_hashable(key)

    if is_tuple:
        if ((all_hashable and key not in obj and set(key).issubset(obj)) or
                not all_hashable):
            # column names ('a', 'b') -> ['a', 'b']
            # arrays like (a, b) -> [a, b]
            msg = ("Interpreting tuple 'by' as a list of keys, rather than "
                   "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
                   "the future, a tuple will always mean a single key.")
            warnings.warn(msg, FutureWarning, stacklevel=5)
            key = list(key)

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray))
        for g in keys)

    try:
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(g in obj.columns or
                                       g in obj.index.names for g in keys)
        else:
            all_in_columns_index = False
    except Exception:
        all_in_columns_index = False

    # a list of scalars matching the axis length that are not keys:
    # treat the whole list as a single tuple-valued grouper
    if not any_callable and not all_in_columns_index and \
       not any_arraylike and not any_groupers and \
       match_axis_length and level is None:
        keys = [com._asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key):
        if not _is_label_like(key):
            try:
                obj._data.items.get_loc(key)
            except Exception:
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        try:
            return id(gpr) == id(obj[gpr.name])
        except Exception:
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    stacklevel = 5  # Number of stack levels from df.groupby
                    obj._check_label_or_level_ambiguity(
                        gpr, stacklevel=stacklevel)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr):
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)

        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None

        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                ("Length of grouper ({len_gpr}) and axis ({len_axis})"
                 " must be same length"
                 .format(len_gpr=len(gpr), len_axis=obj.shape[axis])))

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level,
                        sort=sort, observed=observed, in_axis=in_axis) \
            if not isinstance(gpr, Grouping) else gpr

        groupings.append(ping)

    if len(groupings) == 0:
        raise ValueError('No group keys passed!')

    # create the internals grouper
    grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj
def to_tuples(self, na_tuple=True):
    """
    Return an array of (left, right) tuples, one per interval.

    If ``na_tuple`` is False, missing intervals become a scalar
    ``np.nan`` instead of a (nan, nan) tuple.
    """
    pairs = com._asarray_tuplesafe(zip(self.left, self.right))
    if na_tuple:
        return pairs
    # GH 18756
    return np.where(~self.isna(), pairs, np.nan)
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
    """Verify ``result`` matches grouping ``df[field]`` by tuple keys."""
    tuple_keys = com._asarray_tuplesafe(lmap(tuple, df[keys].values))
    expected = f(df.groupby(tuple_keys)[field])
    for key, value in compat.iteritems(expected):
        assert (result[key] == value)
def _convert_arr_indexer(self, keyarr):
    """Coerce an array-like indexer, wrapping it in this index type."""
    converted = _asarray_tuplesafe(keyarr)
    return self._shallow_copy(converted)
def _read_panel_table(self, group, where=None):
    """
    Read a stored panel table from HDF ``group`` and reassemble a Panel.

    Rows are keyed by (index, column); duplicates are resolved by
    keeping the most recently appended entry.
    """
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    major = Factor.from_array(index)
    minor = Factor.from_array(columns)

    J, K = len(major.levels), len(minor.levels)
    # flat group key combining major and minor label positions
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        # no duplicate (index, column) pairs: sort and build 3d blocks
        sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
        sorter = com._ensure_platform_int(sorter)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.ref_items,
                                     major.levels, minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print('Duplicate entries in table, taking most recently '
                  'appended')

        # reconstruct via a long-format frame, then de-duplicate
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index._tuple_index

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # match keeps the last occurrence per unique tuple
        indexer = match(unique_tuples, tuple_index)
        indexer = com._ensure_platform_int(indexer)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)
    return wp
def _convert_to_indexer(self, obj, axis=0):
    """
    Convert indexing key into something we can use to do actual fancy
    indexing on an ndarray.

    Examples
    --------
    ix[:5] -> slice(0, 5)
    ix[[1,2,3]] -> [1,2,3]
    ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

    Going by Zen of Python?
    "In the face of ambiguity, refuse the temptation to guess."
    raise AmbiguousIndexError with integer labels?
    - No, prefer label-based indexing
    """
    labels = self.obj._get_axis(axis)
    is_int_index = _is_integer_index(labels)

    # integer key on a non-integer axis: treat as a position directly
    if com.is_integer(obj) and not is_int_index:
        return obj

    # first try an exact label lookup
    try:
        return labels.get_loc(obj)
    except (KeyError, TypeError):
        pass

    if isinstance(obj, slice):
        ltype = labels.inferred_type

        # in case of providing all floats, use label-based indexing
        float_slice = (labels.inferred_type == 'floating' and
                       _is_float_slice(obj))

        # floats that are within tolerance of int used as positions
        int_slice = _is_index_slice(obj)

        null_slice = obj.start is None and obj.stop is None

        # could have integers in the first level of the MultiIndex,
        # in which case we wouldn't want to do position-based slicing
        position_slice = (int_slice and
                          not ltype == 'integer' and
                          not isinstance(labels, MultiIndex) and
                          not float_slice)

        start, stop = obj.start, obj.stop

        # last ditch effort: if we are mixed and have integers
        try:
            if position_slice and 'mixed' in ltype:
                if start is not None:
                    i = labels.get_loc(start)
                if stop is not None:
                    j = labels.get_loc(stop)
                # both endpoints resolved as labels -> label-based slice
                position_slice = False
        except KeyError:
            if ltype == 'mixed-integer-float':
                raise

        if null_slice or position_slice:
            # pass the slice through unchanged (positional semantics)
            indexer = obj
        else:
            try:
                indexer = labels.slice_indexer(start, stop, obj.step)
            except Exception:
                if _is_index_slice(obj):
                    if ltype == 'integer':
                        raise
                    indexer = obj
                else:
                    raise

        return indexer

    elif _is_list_like(obj):
        if com._is_bool_indexer(obj):
            # boolean mask -> positions of the True entries
            obj = _check_bool_indexer(labels, obj)
            inds, = obj.nonzero()
            return inds
        else:
            if isinstance(obj, Index):
                objarr = obj.values
            else:
                objarr = _asarray_tuplesafe(obj)

            # If have integer labels, defer to label-based indexing
            if _is_integer_dtype(objarr) and not is_int_index:
                if labels.inferred_type != 'integer':
                    # negative integers count back from the end
                    objarr = np.where(objarr < 0,
                                      len(labels) + objarr, objarr)
                return objarr

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                    not isinstance(objarr[0], tuple)):
                # scalar keys against a MultiIndex: match first level
                level = 0
                _, indexer = labels.reindex(objarr, level=level)
                check = labels.levels[0].get_indexer(objarr)
            else:
                level = None

                # unique index
                if labels.is_unique:
                    indexer = check = labels.get_indexer(objarr)

                # non-unique (dups)
                else:
                    indexer, missing = labels.get_indexer_non_unique(objarr)
                    check = indexer

            mask = check == -1
            if mask.any():
                raise KeyError('%s not in index' % objarr[mask])

            return indexer
    else:
        # scalar fallback: let the index resolve (or raise)
        return labels.get_loc(obj)
def rolling_window(arg, window=None, win_type=None, min_periods=None,
                   freq=None, center=False, mean=True, time_rule=None,
                   axis=0, **kwargs):
    """
    Apply a weighted moving window of size ``window`` to ``arg``.

    Parameters
    ----------
    arg : Series, DataFrame
    window : int or ndarray
        Weighting window specification. An integer is the window length
        (``win_type`` is then required); an array gives the weights
        directly.
    win_type : str, default None
        Window type (see Notes).
    min_periods : int
        Minimum number of observations in window required to have a
        value.
    freq : None or string alias / date offset object, default=None
        Frequency to conform to before computing statistic.
    center : boolean, default False
        Whether the label should correspond with center of window.
    mean : boolean, default True
        If True computes weighted mean, else weighted sum.

    Returns
    -------
    y : type of input argument

    Notes
    -----
    Recognized window types: ``boxcar``, ``triang``, ``blackman``,
    ``hamming``, ``bartlett``, ``parzen``, ``bohman``,
    ``blackmanharris``, ``nuttall``, ``barthann``, ``kaiser`` (needs
    beta), ``gaussian`` (needs std), ``general_gaussian`` (needs power,
    width), ``slepian`` (needs width).
    """
    if isinstance(window, (list, tuple, np.ndarray)):
        # caller supplied explicit weights; a window type is ambiguous here
        if win_type is not None:
            raise ValueError(('Do not specify window type if using custom '
                              'weights'))
        weights = com._asarray_tuplesafe(window).astype(float)
    elif com.is_integer(window):  # window size
        if win_type is None:
            raise ValueError('Must specify window type')
        try:
            import scipy.signal as sig
        except ImportError:
            raise ImportError('Please install scipy to generate window '
                              'weight')
        win_type = _validate_win_type(win_type, kwargs)  # may pop from kwargs
        weights = sig.get_window(win_type, window).astype(float)
    else:
        raise ValueError('Invalid window %s' % str(window))

    minp = _use_window(min_periods, len(weights))
    arg = _conv_timerule(arg, freq, time_rule)
    return_hook, values = _process_data_structure(arg)

    def _apply(x):
        return algos.roll_window(x, weights, minp, avg=mean)

    rs = return_hook(np.apply_along_axis(_apply, axis, values))
    if center:
        rs = _center_window(rs, len(weights), axis)
    return rs
def _convert_to_indexer(self, obj, axis=0):
    """
    Convert indexing key into something we can use to do actual fancy
    indexing on an ndarray.

    Examples
    --------
    ix[:5] -> slice(0, 5)
    ix[[1,2,3]] -> [1,2,3]
    ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)

    Going by Zen of Python?
    "In the face of ambiguity, refuse the temptation to guess."
    raise AmbiguousIndexError with integer labels?
    - No, prefer label-based indexing
    """
    labels = self.obj._get_axis(axis)
    is_int_index = _is_integer_index(labels)

    # integer key on a non-integer axis: treat as a position directly
    if com.is_integer(obj) and not is_int_index:
        return obj

    # first try an exact label lookup
    try:
        return labels.get_loc(obj)
    except (KeyError, TypeError):
        pass

    if isinstance(obj, slice):
        # a slice of ints (or None endpoints) may be positional
        int_slice = _is_index_slice(obj)
        null_slice = obj.start is None and obj.stop is None
        # could have integers in the first level of the MultiIndex
        position_slice = (int_slice
                          and not labels.inferred_type == 'integer'
                          and not isinstance(labels, MultiIndex))

        start, stop = obj.start, obj.stop

        # last ditch effort: if we are mixed and have integers
        try:
            if 'mixed' in labels.inferred_type and int_slice:
                if start is not None:
                    i = labels.get_loc(start)
                if stop is not None:
                    j = labels.get_loc(stop)
                # both endpoints resolved as labels -> label-based slice
                position_slice = False
        except KeyError:
            if labels.inferred_type == 'mixed-integer':
                raise

        if null_slice or position_slice:
            # pass the slice through unchanged (positional semantics)
            slicer = obj
        else:
            try:
                i, j = labels.slice_locs(start, stop)
                slicer = slice(i, j, obj.step)
            except Exception:
                if _is_index_slice(obj):
                    if labels.inferred_type == 'integer':
                        raise
                    slicer = obj
                else:
                    raise

        return slicer

    elif _is_list_like(obj):
        if com._is_bool_indexer(obj):
            # boolean mask: validated and returned as-is
            objarr = _check_bool_indexer(labels, obj)
            return objarr
        else:
            objarr = _asarray_tuplesafe(obj)

            # If have integer labels, defer to label-based indexing
            if _is_integer_dtype(objarr) and not is_int_index:
                return objarr

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                    not isinstance(objarr[0], tuple)):
                # scalar keys against a MultiIndex: match first level
                level = 0
                _, indexer = labels.reindex(objarr, level=level)
                check = labels.levels[0].get_indexer(objarr)
            else:
                level = None
                indexer = check = labels.get_indexer(objarr)

            # every requested label must have been found
            mask = check == -1
            if mask.any():
                raise KeyError('%s not in index' % objarr[mask])

            return indexer
    else:
        # scalar fallback: let the index resolve (or raise)
        return labels.get_loc(obj)
def to_tuples(self):
    """Return an Index of tuples of the form (left, right)"""
    pairs = zip(self.left, self.right)
    return Index(_asarray_tuplesafe(pairs))
def to_tuples(self):
    """Return an Index of (left, right) tuples, one per interval."""
    tuples = _asarray_tuplesafe(zip(self.left, self.right))
    return Index(tuples)
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
    """
    create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure of what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    Returns
    -------
    tuple of (grouper, exclusions, obj)
    """
    # The implementation is essentially the same as pandas.core.groupby

    group_axis = obj._get_axis(axis)

    # validate that the passed level is compatible with the passed
    # axis of the object
    if level is not None:
        if not isinstance(group_axis, MultiIndex):
            if isinstance(level, compat.string_types):
                if obj.index.name != level:
                    raise ValueError('level name %s is not the name of the '
                                     'index' % level)
            elif level > 0:
                raise ValueError('level > 0 only valid with MultiIndex')

            # on a flat index, a level reference just means the axis itself
            level = None
            key = group_axis

    # a passed in Grouper, directly convert
    if isinstance(key, Grouper):
        binner, grouper, obj = key._get_grouper(obj)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, set([key.key]), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    if not isinstance(key, (tuple, list)):
        keys = [key]
    else:
        keys = key

    # what are we after, exactly?
    match_axis_length = len(keys) == len(group_axis)
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray))
        for g in keys)

    try:
        if isinstance(obj, DataFrame):
            all_in_columns = all(g in obj.columns for g in keys)
        else:
            all_in_columns = False
    except Exception:
        all_in_columns = False

    # a list of scalars matching the axis length that are not columns:
    # treat the whole list as a single tuple-valued grouper
    if (not any_callable and not all_in_columns and
            not any_arraylike and match_axis_length and
            level is None):
        keys = [com._asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key):
        if not _is_label_like(key):
            try:
                obj._data.items.get_loc(key)
            except Exception:
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        try:
            return id(gpr) == id(obj[gpr.name])
        except Exception:
            return False

    for i, (gpr, level) in enumerate(zip(keys, levels)):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)
        elif is_in_axis(gpr):  # df.groupby('name')
            in_axis, name, gpr = True, gpr, obj[gpr]
            exclusions.append(name)
        else:
            in_axis, name = False, None

        if com.is_categorical_dtype(gpr) and len(gpr) != len(obj):
            raise ValueError(
                "Categorical dtype grouper must have len(grouper) == len(data)"
            )

        ping = Grouping(group_axis, gpr, obj=obj, name=name, level=level,
                        sort=sort, in_axis=in_axis)

        groupings.append(ping)

    if len(groupings) == 0:
        raise ValueError('No group keys passed!')

    # create the internals grouper
    # Modified to insert CustomGrouper
    grouper = CustomGrouper(group_axis, groupings, sort=sort)
    return grouper, exclusions, obj
def test_int64_overflow(self):
    """
    Groupby with enough keys that the combined group key can overflow
    int64: results must agree regardless of key order and match a
    tuple-keyed groupby computed independently.
    """
    B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
    A = np.arange(2500)
    df = DataFrame({'A': A,
                    'B': B,
                    'C': A,
                    'D': B,
                    'E': A,
                    'F': B,
                    'G': A,
                    'H': B,
                    'values': np.random.randn(2500)})

    lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])
    rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A'])

    left = lg.sum()['values']
    right = rg.sum()['values']

    # both results should come back sorted by their group keys
    exp_index, _ = left.index.sortlevel()
    self.assert_index_equal(left.index, exp_index)

    exp_index, _ = right.index.sortlevel(0)
    self.assert_index_equal(right.index, exp_index)

    # independent reference: group by the row tuples directly
    tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'
                               ]].values))
    tups = com._asarray_tuplesafe(tups)

    expected = df.groupby(tups).sum()['values']

    for k, v in compat.iteritems(expected):
        # reversed key order must give the same group sums
        self.assertEqual(left[k], right[k[::-1]])
        self.assertEqual(left[k], v)
    self.assertEqual(len(left), len(right))

    # GH9096
    values = range(55109)
    data = pd.DataFrame.from_dict({'a': values,
                                   'b': values,
                                   'c': values,
                                   'd': values})
    grouped = data.groupby(['a', 'b', 'c', 'd'])
    self.assertEqual(len(grouped), len(values))

    arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5))
    i = np.random.choice(len(arr), len(arr) * 4)
    arr = np.vstack((arr, arr[i]))  # add some duplicate rows

    i = np.random.permutation(len(arr))
    arr = arr[i]  # shuffle rows

    df = DataFrame(arr, columns=list('abcde'))
    df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10
    gr = df.groupby(list('abcde'))

    # verify this is testing what it is supposed to test!
    self.assertTrue(is_int64_overflow_possible(gr.grouper.shape))

    # manually compute groupings
    jim, joe = defaultdict(list), defaultdict(list)
    for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']):
        jim[key].append(a)
        joe[key].append(b)

    self.assertEqual(len(gr), len(jim))
    mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde'))

    def aggr(func):
        # aggregate the hand-built groupings with func for comparison
        f = lambda a: np.fromiter(map(func, a), dtype='f8')
        arr = np.vstack((f(jim.values()), f(joe.values()))).T
        res = DataFrame(arr, columns=['jim', 'joe'], index=mi)
        return res.sort_index()

    assert_frame_equal(gr.mean(), aggr(np.mean))
    assert_frame_equal(gr.median(), aggr(np.median))