def _wrap_aggregated_output(self, output, mask):
    # sort of a kludge
    output = output[self.name]
    index = self._get_multi_index(mask)
    return Series(output, index=index)
def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
                               observed, grand_margin, margins_name="All"):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ("",) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(
                rows, observed=observed).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0, axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                try:
                    piece[all_key] = margin[key]
                except TypeError:
                    # we cannot reshape, so coerce the axis
                    piece.set_axis(
                        piece._get_axis(cat_axis)._to_safe_for_reshape(),
                        axis=cat_axis,
                        inplace=True,
                    )
                    piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            margin = grand_margin
            cat_axis = 0
            for key, piece in table.groupby(level=0, axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)
                table_pieces.append(piece)
                table_pieces.append(Series(margin[key], index=[all_key]))
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(
            cols, observed=observed).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + list(range(len(cols)))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
def _get_empty_meta(self, columns, index_col, index_names,
                    dtype: DtypeArg | None = None):
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not is_dict_like(dtype):
        # if dtype == None, default will be object.
        default_dtype = dtype or object
        # error: Argument 1 to "defaultdict" has incompatible type "Callable[[],
        # Union[ExtensionDtype, str, dtype[Any], Type[object], Dict[Hashable,
        # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
        # Type[int], Type[complex], Type[bool], Type[object]]]]]"; expected
        # "Optional[Callable[[], Union[ExtensionDtype, str, dtype[Any],
        # Type[object]]]]"
        # error: Incompatible return value type (got "Union[ExtensionDtype, str,
        # dtype[Any], Type[object], Dict[Hashable, Union[ExtensionDtype, Union[str,
        # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], Type[bool],
        # Type[object]]]]", expected "Union[ExtensionDtype, str, dtype[Any],
        # Type[object]]")
        dtype = defaultdict(
            lambda: default_dtype  # type: ignore[arg-type, return-value]
        )
    else:
        dtype = cast(dict, dtype)
        dtype = defaultdict(
            lambda: object,
            {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
        )

    # Even though we have no data, the "index" of the empty DataFrame
    # could for example still be an empty MultiIndex. Thus, we need to
    # check whether we have any index columns specified, via either:
    #
    # 1) index_col (column indices)
    # 2) index_names (column names)
    #
    # Both must be non-null to ensure a successful construction. Otherwise,
    # we have to create a generic empty Index.
    if (index_col is None or index_col is False) or index_names is None:
        index = Index([])
    else:
        data = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(data, names=index_names)
        index_col.sort()

        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = {
        col_name: Series([], dtype=dtype[col_name]) for col_name in columns
    }

    return index, columns, col_dict
def __new__(cls, data, index=None, sparse_index=None, kind='block',
            fill_value=None, name=None, copy=False):

    is_sparse_array = isinstance(data, SparseArray)
    if fill_value is None:
        if is_sparse_array:
            fill_value = data.fill_value
        else:
            fill_value = nan

    if is_sparse_array:
        if isinstance(data, SparseSeries) and index is None:
            index = data.index
        elif index is not None:
            assert len(index) == len(data)

        sparse_index = data.sp_index
        values = np.asarray(data)
    elif isinstance(data, (Series, dict)):
        if index is None:
            index = data.index

        data = Series(data)
        values, sparse_index = make_sparse(data, kind=kind,
                                           fill_value=fill_value)
    elif np.isscalar(data):  # pragma: no cover
        if index is None:
            raise Exception('must pass index!')

        values = np.empty(len(index))
        values.fill(data)

        # TODO: more efficient
        values, sparse_index = make_sparse(values, kind=kind,
                                           fill_value=fill_value)
    else:
        # array-like
        if sparse_index is None:
            values, sparse_index = make_sparse(data, kind=kind,
                                               fill_value=fill_value)
        else:
            values = data
            assert len(values) == sparse_index.npoints

    if index is None:
        index = Index(np.arange(sparse_index.length))
    index = _ensure_index(index)

    # Create array, do *not* copy data by default
    if copy:
        subarr = np.array(values, dtype=np.float64, copy=True)
    else:
        subarr = np.asarray(values, dtype=np.float64)

    if index.is_all_dates():
        cls = SparseTimeSeries

    # Change the class of the array to be the subclass type.
    output = subarr.view(cls)
    output._sp_values = subarr
    output.sp_index = sparse_index
    output.fill_value = np.float64(fill_value)
    output.index = index
    output.name = name
    return output
def _initDict(self, data, index, columns, objects, dtype):
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Somehow this got outrageously complicated
    """
    # pre-filter out columns if we passed it
    if columns is not None:
        colset = set(columns)
        data = dict((k, v) for k, v in data.iteritems() if k in colset)

    index = _extract_index(data, index)

    objectDict = {}
    if objects is not None and isinstance(objects, dict):
        objectDict.update(objects)

    valueDict = {}
    for k, v in data.iteritems():
        if isinstance(v, Series):
            if v.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                v = v.reindex(index)
        else:
            if isinstance(v, dict):
                v = [v.get(i, NaN) for i in index]
            else:
                assert len(v) == len(index)

            try:
                v = Series(v, dtype=dtype, index=index)
            except Exception:
                v = Series(v, index=index)

        if issubclass(v.dtype.type, (np.bool_, float, int)):
            valueDict[k] = v
        else:
            objectDict[k] = v

    if columns is None:
        columns = Index(_try_sort(valueDict))
        objectColumns = Index(_try_sort(objectDict))
    else:
        objectColumns = Index([c for c in columns if c in objectDict])
        columns = Index([c for c in columns if c not in objectDict])

    if len(valueDict) == 0:
        dtype = np.object_
        valueDict = objectDict
        columns = objectColumns
    else:
        dtypes = set(v.dtype for v in valueDict.values())

        if len(dtypes) > 1:
            dtype = np.float_
        else:
            dtype = list(dtypes)[0]

        if len(objectDict) > 0:
            new_objects = DataMatrix(objectDict,
                                     dtype=np.object_,
                                     index=index,
                                     columns=objectColumns)
            if isinstance(objects, DataMatrix):
                objects = objects.join(new_objects, how='left')
            else:
                objects = new_objects

    values = np.empty((len(index), len(columns)), dtype=dtype)

    for i, col in enumerate(columns):
        if col in valueDict:
            values[:, i] = valueDict[col]
        else:
            values[:, i] = np.NaN

    return index, columns, values, objects
def beta(self):
    return Series(self._beta_raw, index=self._x.columns)
def _generate_marginal_results(
    table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All",
):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ("",) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(
                rows, observed=observed).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0, axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                try:
                    piece[all_key] = margin[key]
                except TypeError:
                    # we cannot reshape, so coerce the axis
                    piece.set_axis(
                        piece._get_axis(cat_axis)._to_safe_for_reshape(),
                        axis=cat_axis,
                        inplace=True,
                    )
                    piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            from pandas import DataFrame

            cat_axis = 0
            for key, piece in table.groupby(level=0, axis=cat_axis,
                                            observed=observed):
                if len(cols) > 1:
                    all_key = _all_key(key)
                else:
                    all_key = margins_name
                table_pieces.append(piece)

                # GH31016 this is to calculate margin for each group, and
                # assign the corresponding key as index
                transformed_piece = DataFrame(piece.apply(aggfunc)).T
                transformed_piece.index = Index([all_key],
                                                name=piece.index.name)

                # append piece for margin into table_pieces
                table_pieces.append(transformed_piece)
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(
            cols, observed=observed).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + list(range(len(cols)))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
def __new__(cls, data, index=None, sparse_index=None, kind='block',
            fill_value=None, name=None, copy=False):

    is_sparse_array = isinstance(data, SparseArray)
    if fill_value is None:
        if is_sparse_array:
            fill_value = data.fill_value
        else:
            fill_value = nan

    if is_sparse_array:
        if isinstance(data, SparseSeries) and index is None:
            index = data.index
        elif index is not None:
            assert len(index) == len(data)

        sparse_index = data.sp_index
        values = np.asarray(data)
    elif isinstance(data, (Series, dict)):
        if index is None:
            index = data.index

        data = Series(data)
        values, sparse_index = make_sparse(data, kind=kind,
                                           fill_value=fill_value)
    elif isinstance(data, (tuple, list, np.ndarray)):
        # array-like
        if sparse_index is None:
            values, sparse_index = make_sparse(data, kind=kind,
                                               fill_value=fill_value)
        else:
            values = data
            assert len(values) == sparse_index.npoints
    else:
        if index is None:
            raise Exception('must pass index!')

        length = len(index)

        if data == fill_value or (isnull(data) and isnull(fill_value)):
            if kind == 'block':
                sparse_index = BlockIndex(length, [], [])
            else:
                sparse_index = IntIndex(length, [])
            values = np.array([])
        else:
            if kind == 'block':
                locs, lens = ([0], [length]) if length else ([], [])
                sparse_index = BlockIndex(length, locs, lens)
            else:
                sparse_index = IntIndex(length, index)
            values = np.empty(length)
            values.fill(data)

    if index is None:
        index = com._default_index(sparse_index.length)
    index = _ensure_index(index)

    # Create array, do *not* copy data by default
    if copy:
        subarr = np.array(values, dtype=np.float64, copy=True)
    else:
        subarr = np.asarray(values, dtype=np.float64)

    if index.is_all_dates:
        cls = SparseTimeSeries

    # Change the class of the array to be the subclass type.
    output = subarr.view(cls)
    output.sp_index = sparse_index
    output.fill_value = np.float64(fill_value)
    output.index = index
    output.name = name
    return output
def selectorOfRouletteWheelRatedItem(votesOfCandidatesDict: dict):
    votesOfCandidatesSer: Series = Series(votesOfCandidatesDict,
                                          index=votesOfCandidatesDict.keys())
    return RouletteWheelSelector.run(votesOfCandidatesSer)
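# A minimal, self-contained sketch of what RouletteWheelSelector.run above is
# assumed to do (the class comes from the surrounding project and is not shown
# here; the helper name below is hypothetical): pick one index label with
# probability proportional to its value.
import numpy as np
from pandas import Series

def _roulette_wheel_pick(votes: Series):
    probs = votes / votes.sum()  # normalize vote counts to probabilities
    return np.random.choice(votes.index, p=probs.values)

# e.g. _roulette_wheel_pick(Series({"a": 10, "b": 30, "c": 60})) returns "c"
# about 60% of the time.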
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):
    # we are called internally, so short-circuit
    if fastpath:
        # data is an ndarray, index is defined
        if not isinstance(data, SingleBlockManager):
            data = SingleBlockManager(data, index, fastpath=True)
        if copy:
            data = data.copy()
    else:
        if data is None:
            data = []

        if isinstance(data, Series) and name is None:
            name = data.name

        if isinstance(data, SparseArray):
            if index is not None:
                assert len(index) == len(data)
            sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value

            data = np.asarray(data)

        elif isinstance(data, SparseSeries):
            if index is None:
                index = data.index.view()
            if fill_value is None:
                fill_value = data.fill_value

            # extract the SingleBlockManager
            data = data._data

        elif isinstance(data, (Series, dict)):
            data = Series(data, index=index)
            index = data.index.view()

            res = make_sparse(data, kind=kind, fill_value=fill_value)
            data, sparse_index, fill_value = res

        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res
            else:
                assert len(data) == sparse_index.npoints

        elif isinstance(data, SingleBlockManager):
            if dtype is not None:
                data = data.astype(dtype)
            if index is None:
                index = data.index.view()
            elif not data.index.equals(index) or copy:  # pragma: no cover
                # GH#19275 SingleBlockManager input should only be called
                # internally
                raise AssertionError('Cannot pass both SingleBlockManager '
                                     '`data` argument and a different '
                                     '`index` argument. `copy` must '
                                     'be False.')

        else:
            length = len(index)

            if data == fill_value or (isna(data) and isna(fill_value)):
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                data = np.array([])

            else:
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                v = data
                data = np.empty(length)
                data.fill(v)

        if index is None:
            index = com._default_index(sparse_index.length)
        index = _ensure_index(index)

        # create/copy the manager
        if isinstance(data, SingleBlockManager):
            if copy:
                data = data.copy()
        else:
            # create a sparse array
            if not isinstance(data, SparseArray):
                data = SparseArray(data, sparse_index=sparse_index,
                                   fill_value=fill_value, dtype=dtype,
                                   copy=copy)

            data = SingleBlockManager(data, index)

    generic.NDFrame.__init__(self, data)

    self.index = index
    self.name = name
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut

    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.labels

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
def setup_method(self) -> None:
    self.data_backend = PandasBackend(
        DataFrame({"a": ["a", "b", "c"],
                   "b": [1, 2, 3],
                   "c": [True, False, True]})
    )
    self.test_series0 = PandasBackend(
        Series({"a": "a", "b": 1, "c": True}), index=[0]
    )
    self.test_series2 = PandasBackend(
        Series({"a": "c", "b": 3, "c": True}), index=[2]
    )
def setUp(self):
    """
    Setup the dataframes used for the groupby tests derived from pandas
    """
    self.dateRange = bdate_range('1/1/2005', periods=250)
    self.stringIndex = Index([rands(8).upper() for x in range(250)])

    self.groupId = Series([x[0] for x in self.stringIndex],
                          index=self.stringIndex)
    self.groupDict = dict((k, v) for k, v in compat.iteritems(self.groupId))

    self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

    randMat = np.random.randn(250, 5)
    self.stringMatrix = DataFrame(randMat, columns=self.columnIndex,
                                  index=self.stringIndex)

    self.timeMatrix = DataFrame(randMat, columns=self.columnIndex,
                                index=self.dateRange)

    self.ts = tm.makeTimeSeries()

    self.seriesd = tm.getSeriesData()
    self.tsd = tm.getTimeSeriesData()
    self.frame = DataFrame(self.seriesd)
    self.tsframe = DataFrame(self.tsd)

    self.df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                               'foo', 'bar', 'foo', 'foo'],
                         'B': ['one', 'one', 'two', 'three',
                               'two', 'two', 'one', 'three'],
                         'C': np.random.randn(8),
                         'D': np.random.randn(8)})

    self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                            'foo', 'bar', 'foo', 'foo'],
                                      'B': ['one', 'one', 'two', 'three',
                                            'two', 'two', 'one', 'three'],
                                      'C': np.random.randn(8),
                                      'D': np.array(np.random.randn(8),
                                                    dtype='float32')})

    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['A', 'B', 'C'])

    self.three_group = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                        'bar', 'bar', 'bar', 'bar',
                                        'foo', 'foo', 'foo'],
                                  'B': ['one', 'one', 'one', 'two',
                                        'one', 'one', 'one', 'two',
                                        'two', 'two', 'one'],
                                  'C': ['dull', 'dull', 'shiny', 'dull',
                                        'dull', 'shiny', 'shiny', 'dull',
                                        'shiny', 'shiny', 'shiny'],
                                  'D': np.random.randn(11),
                                  'E': np.random.randn(11),
                                  'F': np.random.randn(11)})

    super(self.__class__, self).setUp()
            result = DatetimeIndex(result, tz='utc' if utc else None)
            return result
        except ValueError as e:
            try:
                values, tz = lib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, datetime):
        return arg
    elif isinstance(arg, Series):
        values = _convert_f(arg.values)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')
        result = _convert_f(arg)
        return result

    try:
        if not arg:
            return arg
        return parse(arg, dayfirst=dayfirst)
    except Exception:
        if errors == 'raise':
            raise
        return arg
def _unstack_multiple(data, clocs):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [val if i > val else val - 1 for val in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
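# A small usage sketch of the behavior _unstack_multiple implements, reached
# through the public API (assumes a pandas version where Series.unstack accepts
# a list of levels; the helper itself is internal):
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], ["x", "y"], [1, 2]],
                                 names=["l1", "l2", "l3"])
s = pd.Series(np.arange(8), index=idx)
wide = s.unstack(["l2", "l3"])  # rows: l1; columns: MultiIndex of (l2, l3)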
def test_groupby_multi_categorical_as_index(self):
    # GH13204
    df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                    'A': [10, 11, 11],
                    'B': [101, 102, 103]})
    result = df.groupby(['cat', 'A'], as_index=False).sum()
    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                          'A': [10, 11, 10, 11, 10, 11],
                          'B': [101.0, nan, nan, 205.0, nan, nan]},
                         columns=['cat', 'A', 'B'])
    tm.assert_frame_equal(result, expected)

    # function grouper
    f = lambda r: df.loc[r, 'A']
    result = df.groupby(['cat', f], as_index=False).sum()
    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                          'A': [10.0, nan, nan, 22.0, nan, nan],
                          'B': [101.0, nan, nan, 205.0, nan, nan]},
                         columns=['cat', 'A', 'B'])
    tm.assert_frame_equal(result, expected)

    # another not in-axis grouper (conflicting names in index)
    s = Series(['a', 'b', 'b'], name='cat')
    result = df.groupby(['cat', s], as_index=False).sum()
    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                          'A': [10.0, nan, nan, 22.0, nan, nan],
                          'B': [101.0, nan, nan, 205.0, nan, nan]},
                         columns=['cat', 'A', 'B'])
    tm.assert_frame_equal(result, expected)

    # is original index dropped?
    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                          'A': [10, 11, 10, 11, 10, 11],
                          'B': [101.0, nan, nan, 205.0, nan, nan]},
                         columns=['cat', 'A', 'B'])

    group_columns = ['cat', 'A']

    for name in [None, 'X', 'B', 'cat']:
        df.index = Index(list("abc"), name=name)

        if name in group_columns and name in df.index.names:
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = df.groupby(group_columns, as_index=False).sum()
        else:
            result = df.groupby(group_columns, as_index=False).sum()

        tm.assert_frame_equal(result, expected, check_index_type=True)
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
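# Usage sketch for the public counterpart of this helper (Series.value_counts
# forwards here in this era of pandas); results assume default sorting:
import numpy as np
import pandas as pd

s = pd.Series(["a", "b", "a", np.nan])
s.value_counts()                # a -> 2, b -> 1 (NaN dropped by default)
s.value_counts(dropna=False)    # adds a count of 1 for NaN
s.value_counts(normalize=True)  # relative frequencies instead of counts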
def data(self, convert_dates=True, convert_categoricals=True, index=None):
    """
    Reads observations from Stata file, converting them into a dataframe

    Parameters
    ----------
    convert_dates : boolean, defaults to True
        Convert date variables to DataFrame time values
    convert_categoricals : boolean, defaults to True
        Read value labels and convert columns to Categorical/Factor
        variables
    index : identifier of index column
        identifier of column that should be used as index of the DataFrame

    Returns
    -------
    y : DataFrame instance
    """
    if self._data_read:
        raise Exception("Data has already been read.")
    self._data_read = True

    if self.format_version >= 117:
        self._read_strls()

    stata_dta = self._dataset()

    data = []
    for rownum, line in enumerate(stata_dta):
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        for i, val in enumerate(line):
            # NOTE: This will only be scalar types because missing strings
            # are empty not None in Stata
            if val is None:
                line[i] = np.nan
        data.append(tuple(line))

    if convert_categoricals:
        self._read_value_labels()

    data = DataFrame(data, columns=self.varlist, index=index)

    cols_ = np.where(self.dtyplist)[0]
    for i in cols_:
        if self.dtyplist[i] is not None:
            col = data.columns[i]
            if data[col].dtype is not np.dtype(object):
                data[col] = Series(data[col], data[col].index,
                                   self.dtyplist[i])

    if convert_dates:
        cols = np.where(lmap(lambda x: x in _date_formats,
                             self.fmtlist))[0]
        for i in cols:
            col = data.columns[i]
            data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                        args=(self.fmtlist[i],))

    if convert_categoricals:
        cols = np.where(
            lmap(lambda x: x in compat.iterkeys(self.value_label_dict),
                 self.lbllist))[0]
        for i in cols:
            col = data.columns[i]
            labeled_data = np.copy(data[col])
            labeled_data = labeled_data.astype(object)
            for k, v in compat.iteritems(
                    self.value_label_dict[self.lbllist[i]]):
                labeled_data[(data[col] == k).values] = v
            data[col] = Categorical.from_array(labeled_data)

    return data
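# Usage sketch of the public entry point that wraps this reader; "data.dta" is
# a placeholder path, not from the source:
import pandas as pd

df = pd.read_stata("data.dta",
                   convert_dates=True,          # Stata dates -> datetime64
                   convert_categoricals=True)   # value labels -> Categorical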
def _add_margins( table: Union["Series", "DataFrame"], data, values, rows, cols, aggfunc, observed=None, margins_name: str = "All", fill_value=None, ): if not isinstance(margins_name, str): raise ValueError("margins_name argument must be a string") msg = 'Conflicting name "{name}" in margins'.format(name=margins_name) for level in table.index.names: if margins_name in table.index.get_level_values(level): raise ValueError(msg) grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) if table.ndim == 2: # i.e. DataFramae for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) key: Union[str, Tuple[str, ...]] if len(rows) > 1: key = (margins_name, ) + ("", ) * (len(rows) - 1) else: key = margins_name if not values and isinstance(table, ABCSeries): # If there are no values and the table is a series, then there is only # one column in the data. Compute grand margin and return it. return table.append(Series({key: grand_margin[margins_name]})) elif values: marginal_result_set = _generate_marginal_results( table, data, values, rows, cols, aggfunc, observed, margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, observed, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set row_margin = row_margin.reindex(result.columns, fill_value=fill_value) # populate grand margin for k in margin_keys: if isinstance(k, str): row_margin[k] = grand_margin[k] else: row_margin[k] = grand_margin[k[0]] from pandas import DataFrame margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names try: # check the result column and leave floats for dtype in set(result.dtypes): cols = result.select_dtypes([dtype]).columns margin_dummy[cols] = margin_dummy[cols].apply( maybe_downcast_to_dtype, args=(dtype, )) result = result.append(margin_dummy) except TypeError: # we cannot reshape, so coerce the axis result.index = result.index._to_safe_for_reshape() result = result.append(margin_dummy) result.index.names = row_names return result
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
                format=None, coerce=False, unit='ns'):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        If True parses dates with the day first, eg 20/01/2005
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug).
    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well)
    box : boolean, default True
        If True returns a DatetimeIndex, if False returns ndarray of values
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y"
    coerce : force errors to NaT (False by default)
    unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
        (e.g. a unix timestamp), which is an integer/float number

    Returns
    -------
    ret : datetime if parsing succeeded
    """
    from pandas import Timestamp
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _convert_listlike(arg, box):

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    pass

            return arg

        arg = com._ensure_object(arg)
        try:
            if format is not None:
                result = None

                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    except:
                        raise ValueError(
                            "cannot convert the input to '%Y%m%d' date "
                            "format")

                # fallback
                if result is None:
                    result = tslib.array_strptime(arg, format, coerce=coerce)
            else:
                result = tslib.array_to_datetime(arg,
                                                 raise_=errors == 'raise',
                                                 utc=utc, dayfirst=dayfirst,
                                                 coerce=coerce, unit=unit)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, Timestamp):
        return arg
    elif isinstance(arg, Series):
        values = _convert_listlike(arg.values, box=False)
        return Series(values, index=arg.index, name=arg.name)
    elif com.is_list_like(arg):
        return _convert_listlike(arg, box=box)

    return _convert_listlike(np.array([arg]), box=box)[0]
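# Usage sketch for this era of to_datetime (note that errors defaults to
# 'ignore' here, unlike later pandas versions where invalid input raises):
import pandas as pd

pd.to_datetime("2005-01-20")                    # single Timestamp
pd.to_datetime(["2005-01-20", "2005-01-21"])    # DatetimeIndex (box=True)
pd.to_datetime("20/01/2005", dayfirst=True)     # prefer day-first parsing
pd.to_datetime(["20050120"], format="%Y%m%d")   # fast path special-cased above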
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type)
                      else '{prefix}{sep}{level}' for v in levels]
        dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
                      for dummy_str, v in zip(dummy_strs, levels)]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs), fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                              default_fill_value=0, dtype=np.uint8)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
if not args.base_category:
    print("Warning: you did not specify a base category. This can take a "
          "*very* long time to complete. See matcher -h for help.")

# load product csv file
print("Parsing input file: %s" % product_file)
product_data = pd.read_csv(product_file, sep=',',
                           usecols=product_columns + [google_category_column])
print("Processing %d rows ..." % product_data.shape[0])

# if the target google category column doesn't exist in the file: add it
if google_category_column not in product_data.columns:
    product_data[google_category_column] = Series()

# iterate through data row by row and match category
index = 1
replacements = 0
for row_index, row in product_data.iterrows():
    index += 1
    if index % 10 == 0:
        print("Progress: %d rows finished" % index)

    p = {}
    for col in product_columns:
        value = safe_get(row, col)
        if value:
            p[col] = row.get(col)

    gcat = safe_get(row, google_category_column)
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)
    Note that dtype of the result is always object, even when no match is
    found and the result is a Series or DataFrame containing only NaN
    values.

    Examples
    --------
    A pattern with one group will return a Series. Non-matches will be NaN.

    >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
    0      1
    1      2
    2    NaN
    dtype: object

    A pattern with more than one group will return a DataFrame.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN
    """
    from pandas.core.series import Series
    from pandas.core.frame import DataFrame

    regex = re.compile(pat, flags=flags)
    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, compat.string_types):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row

    if regex.groups == 1:
        result = Series([f(val)[0] for val in arr],
                        name=_get_single_group_name(regex),
                        index=arr.index, dtype=object)
    else:
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            result = DataFrame([f(val) for val in arr], columns=columns,
                               index=arr.index, dtype=object)
    return result
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)

    if dummy_na:
        levels = np.append(cat.levels, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.codes == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _add_margins(
    table,
    data,
    values,
    rows,
    cols,
    aggfunc,
    observed=None,
    margins_name="All",
    fill_value=None,
):
    if not isinstance(margins_name, str):
        raise ValueError("margins_name argument must be a string")

    msg = 'Conflicting name "{name}" in margins'.format(name=margins_name)
    for level in table.index.names:
        if margins_name in table.index.get_level_values(level):
            raise ValueError(msg)

    grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

    # could be passed a Series object with no 'columns'
    if hasattr(table, "columns"):
        for level in table.columns.names[1:]:
            if margins_name in table.columns.get_level_values(level):
                raise ValueError(msg)

    if len(rows) > 1:
        key = (margins_name,) + ("",) * (len(rows) - 1)
    else:
        key = margins_name

    if not values and isinstance(table, ABCSeries):
        # If there are no values and the table is a series, then there is
        # only one column in the data. Compute grand margin and return it.
        return table.append(Series({key: grand_margin[margins_name]}))

    if values:
        marginal_result_set = _generate_marginal_results(
            table,
            data,
            values,
            rows,
            cols,
            aggfunc,
            observed,
            grand_margin,
            margins_name,
        )
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    else:
        marginal_result_set = _generate_marginal_results_without_values(
            table, data, rows, cols, aggfunc, observed, margins_name)
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set

    row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
    # populate grand margin
    for k in margin_keys:
        if isinstance(k, str):
            row_margin[k] = grand_margin[k]
        else:
            row_margin[k] = grand_margin[k[0]]

    from pandas import DataFrame

    margin_dummy = DataFrame(row_margin, columns=[key]).T

    row_names = result.index.names
    try:
        for dtype in set(result.dtypes):
            cols = result.select_dtypes([dtype]).columns
            margin_dummy[cols] = margin_dummy[cols].astype(dtype)
        result = result.append(margin_dummy)
    except TypeError:
        # we cannot reshape, so coerce the axis
        result.index = result.index._to_safe_for_reshape()
        result = result.append(margin_dummy)

    result.index.names = row_names

    return result
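# Usage sketch of the public path into _add_margins: pivot_table with
# margins=True; the frame below is illustrative:
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": ["foo", "foo", "bar", "bar"],
                   "B": ["one", "two", "one", "two"],
                   "C": [1.0, 2.0, 3.0, 4.0]})
pd.pivot_table(df, values="C", index="A", columns="B",
               aggfunc=np.sum, margins=True, margins_name="All")
# appends an "All" row and an "All" column holding the grand totals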
def __init__(self, data, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):

    # we are called internally, so short-circuit
    if fastpath:

        # data is an ndarray, index is defined
        data = SingleBlockManager(data, index, fastpath=True)
        if copy:
            data = data.copy()
    else:

        is_sparse_array = isinstance(data, SparseArray)
        if fill_value is None:
            if is_sparse_array:
                fill_value = data.fill_value
            else:
                fill_value = nan

        if is_sparse_array:
            if isinstance(data, SparseSeries) and index is None:
                index = data.index.view()
            elif index is not None:
                assert len(index) == len(data)

            sparse_index = data.sp_index
            data = np.asarray(data)

        elif isinstance(data, SparseSeries):
            if index is None:
                index = data.index.view()

            # extract the SingleBlockManager
            data = data._data

        elif isinstance(data, (Series, dict)):
            if index is None:
                index = data.index.view()

            data = Series(data)
            data, sparse_index = make_sparse(data, kind=kind,
                                             fill_value=fill_value)

        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                data, sparse_index = make_sparse(data, kind=kind,
                                                 fill_value=fill_value)
            else:
                assert len(data) == sparse_index.npoints

        elif isinstance(data, SingleBlockManager):
            if dtype is not None:
                data = data.astype(dtype)
            if index is None:
                index = data.index.view()
            else:
                data = data.reindex(index, copy=False)

        else:
            length = len(index)

            if data == fill_value or (isnull(data) and isnull(fill_value)):
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                data = np.array([])

            else:
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                v = data
                data = np.empty(length)
                data.fill(v)

        if index is None:
            index = com._default_index(sparse_index.length)
        index = _ensure_index(index)

        # create/copy the manager
        if isinstance(data, SingleBlockManager):
            if copy:
                data = data.copy()
        else:

            # create a sparse array
            if not isinstance(data, SparseArray):
                data = SparseArray(data, sparse_index=sparse_index,
                                   fill_value=fill_value, dtype=dtype,
                                   copy=copy)

            data = SingleBlockManager(data, index)

    generic.NDFrame.__init__(self, data)

    self.index = index
    self.name = name
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is None or (
                isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.flexible)
            ):
                # GH#1783
                nan_dtype = np.dtype("object")
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

        arrays = list(arrays)

    else:
        keys = list(data.keys())
        columns = data_names = Index(keys)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        # GH#24096 need copy to be deep for datetime64tz case
        # TODO: See if we can avoid these copies
        arrays = [
            arr if not isinstance(arr, ABCIndex) else arr._data for arr in arrays
        ]
        arrays = [
            arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
        ]

    if copy:
        # arrays_to_mgr (via form_blocks) won't make copies for EAs
        # dtype attr check to exclude EADtype-castable strs
        arrays = [
            x
            if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
            else x.copy()
            for x in arrays
        ]
        # TODO: can we get rid of the dt64tz special case above?

    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype,
                         typ=typ, consolidate=copy)
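# Behavior sketch for the GH10856 branch above: with a dict of nothing but
# scalars and no explicit index, no index can be inferred, so construction
# raises; passing an index broadcasts the scalars.
import pandas as pd

pd.DataFrame({"a": [1, 2], "b": [3, 4]})      # index inferred from the arrays
pd.DataFrame({"a": 1, "b": 2}, index=[0, 1])  # scalars broadcast over the index
# pd.DataFrame({"a": 1, "b": 2})              # ValueError: needs an index (GH10856)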
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = "{prefix}{prefix_sep}{level}"
            return fstr.format(prefix=prefix, prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [_make_col_name(prefix, prefix_sep, level)
                      for level in levels]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
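# Usage sketch of the public wrapper over this helper; dtype, sparse, dummy_na
# and drop_first are the knobs exercised above:
import pandas as pd

s = pd.Series(["a", "b", "a", None])
pd.get_dummies(s)                   # dense uint8 indicator columns "a", "b"
pd.get_dummies(s, dummy_na=True)    # adds an indicator column for NaN
pd.get_dummies(s, drop_first=True)  # drops the first level (collinearity)
pd.get_dummies(s, sparse=True)      # columns backed by SparseArray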
def aggregate(self, func_or_funcs, *args, **kwargs):
    """
    Apply aggregation function or functions to groups, yielding most
    likely Series but in some cases DataFrame depending on the output of
    the aggregation function

    Parameters
    ----------
    func_or_funcs : function or list / dict of functions
        List/dict of functions will produce DataFrame with column names
        determined by the function names themselves (list) or the keys
        in the dict

    Notes
    -----
    agg is an alias for aggregate. Use it.

    Examples
    --------
    >>> series
    bar    1.0
    baz    2.0
    qot    3.0
    qux    4.0

    >>> mapper = lambda x: x[0]  # first letter
    >>> grouped = series.groupby(mapper)

    >>> grouped.aggregate(np.sum)
    b    3.0
    q    7.0

    >>> grouped.aggregate([np.sum, np.mean, np.std])
       mean  std  sum
    b  1.5   0.5  3
    q  3.5   0.5  7

    >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
    ...              'total' : np.sum})
       result  total
    b  2.121   3
    q  4.95    7

    See also
    --------
    apply, transform

    Returns
    -------
    Series or DataFrame
    """
    if isinstance(func_or_funcs, basestring):
        return getattr(self, func_or_funcs)(*args, **kwargs)

    if hasattr(func_or_funcs, '__iter__'):
        ret = self._aggregate_multiple_funcs(func_or_funcs)
    else:
        if len(self.groupings) > 1:
            return self._python_agg_general(func_or_funcs, *args, **kwargs)

        try:
            return self._python_agg_general(func_or_funcs, *args, **kwargs)
        except Exception:
            result = self._aggregate_named(func_or_funcs, *args, **kwargs)

        index = Index(sorted(result), name=self.groupings[0].name)
        ret = Series(result, index=index)

    if not self.as_index:  # pragma: no cover
        print('Warning, ignoring as_index=True')

    return ret