def test_empty(self): # product of empty factors X = [[], [0, 1], []] Y = [[], [], ['a', 'b', 'c']] for x, y in zip(X, Y): expected1 = np.array([], dtype=np.asarray(x).dtype) expected2 = np.array([], dtype=np.asarray(y).dtype) result1, result2 = cartesian_product([x, y]) tm.assert_numpy_array_equal(result1, expected1) tm.assert_numpy_array_equal(result2, expected2) # empty product (empty input): result = cartesian_product([]) expected = [] tm.assert_equal(result, expected)
def test_simple(self): x, y = list('ABC'), [1, 22] result1, result2 = cartesian_product([x, y]) expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C']) expected2 = np.array([1, 22, 1, 22, 1, 22]) tm.assert_numpy_array_equal(result1, expected1) tm.assert_numpy_array_equal(result2, expected2)
def test_datetimeindex(self): # regression test for GitHub issue #6439 # make sure that the ordering on datetimeindex is consistent x = date_range('2000-01-01', periods=2) result = [Index(y).day for y in cartesian_product([x, x])] expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])] assert_equal(result, expected)
def _apply_1d(self, func, axis): axis_name = self._get_axis_name(axis) ax = self._get_axis(axis) ndim = self.ndim values = self.values # iter thru the axes slice_axis = self._get_axis(axis) slice_indexer = [0]*(ndim-1) indexer = np.zeros(ndim, 'O') indlist = list(range(ndim)) indlist.remove(axis) indexer[axis] = slice(None, None) indexer.put(indlist, slice_indexer) planes = [ self._get_axis(axi) for axi in indlist ] shape = np.array(self.shape).take(indlist) # all the iteration points points = cartesian_product(planes) results = [] for i in range(np.prod(shape)): # construct the object pts = tuple([ p[i] for p in points ]) indexer.put(indlist, slice_indexer) obj = Series(values[tuple(indexer)],index=slice_axis,name=pts) result = func(obj) results.append(result) # increment the indexer slice_indexer[-1] += 1 n = -1 while (slice_indexer[n] >= shape[n]) and (n > (1-ndim)): slice_indexer[n-1] += 1 slice_indexer[n] = 0 n -= 1 # empty object if not len(results): return self._constructor(**self._construct_axes_dict()) # same ndim as current if isinstance(results[0],Series): arr = np.vstack([ r.values for r in results ]) arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape))) tranp = np.array([axis]+indlist).argsort() arr = arr.transpose(tuple(list(tranp))) return self._constructor(arr,**self._construct_axes_dict()) # ndim-1 shape results = np.array(results).reshape(shape) if results.ndim == 2 and axis_name != self._info_axis_name: results = results.T planes = planes[::-1] return self._construct_return_type(results,planes)
def test_simple(self): x, y = list('ABC'), [1, 22] result = cartesian_product([x, y]) expected = [ np.array(['A', 'A', 'B', 'B', 'C', 'C']), np.array([1, 22, 1, 22, 1, 22]) ] assert_equal(result, expected)
def test_datetimeindex(self): # regression test for GitHub issue #6439 # make sure that the ordering on datetimeindex is consistent x = date_range('2000-01-01', periods=2) result1, result2 = [Index(y).day for y in cartesian_product([x, x])] expected1 = np.array([1, 1, 2, 2], dtype=np.int32) expected2 = np.array([1, 2, 1, 2], dtype=np.int32) tm.assert_numpy_array_equal(result1, expected1) tm.assert_numpy_array_equal(result2, expected2)
def setUp(self): tm._skip_if_no_scipy() import scipy.sparse # SparseSeries inputs used in tests, the tests rely on the order self.sparse_series = [] s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), (1, 2, 'a', 1), (1, 1, 'b', 0), (1, 1, 'b', 1), (2, 1, 'b', 0), (2, 1, 'b', 1)], names=['A', 'B', 'C', 'D']) self.sparse_series.append(s.to_sparse()) ss = self.sparse_series[0].copy() ss.index.names = [3, 0, 1, 2] self.sparse_series.append(ss) ss = pd.Series([nan] * 12, index=cartesian_product( (range(3), range(4)))).to_sparse() for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]): ss[k] = v self.sparse_series.append(ss) # results used in tests self.coo_matrices = [] self.coo_matrices.append( scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4))) self.coo_matrices.append( scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))) self.coo_matrices.append( scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2))) self.ils = [[(1, 2), (1, 1), (2, 1)], [(1, 1), (1, 2), (2, 1)], [(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b')]] self.jls = [[('a', 0), ('a', 1), ('b', 0), ('b', 1)], [0, 1]]
def setUp(self): tm._skip_if_no_scipy() import scipy.sparse # SparseSeries inputs used in tests, the tests rely on the order self.sparse_series = [] s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), (1, 2, 'a', 1), (1, 1, 'b', 0), (1, 1, 'b', 1), (2, 1, 'b', 0), (2, 1, 'b', 1)], names=['A', 'B', 'C', 'D']) self.sparse_series.append(s.to_sparse()) ss = self.sparse_series[0].copy() ss.index.names = [3, 0, 1, 2] self.sparse_series.append(ss) ss = pd.Series([ nan ] * 12, index=cartesian_product((range(3), range(4)))).to_sparse() for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]): ss[k] = v self.sparse_series.append(ss) # results used in tests self.coo_matrices = [] self.coo_matrices.append(scipy.sparse.coo_matrix( ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4))) self.coo_matrices.append(scipy.sparse.coo_matrix( ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))) self.coo_matrices.append(scipy.sparse.coo_matrix( ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2))) self.ils = [[(1, 2), (1, 1), (2, 1)], [(1, 1), (1, 2), (2, 1)], [(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b')]] self.jls = [[('a', 0), ('a', 1), ('b', 0), ('b', 1)], [0, 1]]
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function, default numpy.mean, or list of functions If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN rows : kwarg only alias of index [deprecated] cols : kwarg only alias of columns [deprecated] Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if isinstance(values, (list, tuple)): values_multi = True else: values_multi = False values = [values] else: values = list(data.columns.drop(keys)) if values_passed: to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [ agged.index.names[i] or i for i in range(len(index), len(keys)) ] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc) # discard the top level if values_passed and not values_multi: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, **kwarg): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : list of column names or arrays to group on Keys to group on the x-axis of the pivot table columns : list of column names or arrays to group on Keys to group on the y-axis of the pivot table aggfunc : function, default numpy.mean, or list of functions If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN rows : kwarg only alias of index [deprecated] cols : kwarg only alias of columns [deprecated] Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame """ # Parse old-style keyword arguments rows = kwarg.pop('rows', None) if rows is not None: warnings.warn("rows is deprecated, use index", FutureWarning) if index is None: index = rows else: msg = "Can only specify either 'rows' or 'index'" raise TypeError(msg) cols = kwarg.pop('cols', None) if cols is not None: warnings.warn("cols is deprecated, use columns", FutureWarning) if columns is None: columns = cols else: msg = "Can only specify either 'cols' or 'columns'" raise TypeError(msg) if kwarg: raise TypeError("Unexpected argument(s): %s" % kwarg.keys()) index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if isinstance(values, (list, tuple)): values_multi = True else: values_multi = False values = [values] else: values = list(data.columns.drop(keys)) if values_passed: to_filter = [] for x in keys + values: try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [agged.index.names[i] for i in range(len(index), len(keys))] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc) # discard the top level if values_passed and not values_multi: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def test_simple(self): x, y = list('ABC'), [1, 22] result = cartesian_product([x, y]) expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), np.array([ 1, 22, 1, 22, 1, 22])] assert_equal(result, expected)
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if com.is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] else: values = list(data.columns.drop(keys)) if values_passed: to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [agged.index.names[i] or i for i in range(len(index), len(keys))] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name) # discard the top level if values_passed and not values_multi: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : column, Grouper, array, or list of the previous If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function or list of functions, default numpy.mean If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame See also -------- DataFrame.pivot : pivot without aggregation that can handle non-numeric data """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError): pass values = list(values) grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [ agged.index.names[i] or i for i in range(len(index), len(keys)) ] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notnull().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name) # discard the top level if values_passed and not values_multi and not table.empty: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def test_simple(self): x, y = list("ABC"), [1, 22] result = cartesian_product([x, y]) expected = [np.array(["A", "A", "B", "B", "C", "C"]), np.array([1, 22, 1, 22, 1, 22])] assert_equal(result, expected)