def test_groupby_categorical_no_compress(self): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean() exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) result = data.groupby("b").mean() result = result["a"].values exp = np.array([1, 2, 4, np.nan]) self.assert_numpy_array_equal(result, exp)
def test_groupby_categorical_no_compress(self): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean() exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) result = data.groupby("b").mean() result = result["a"].values exp = np.array([1, 2, 4, np.nan]) self.assert_numpy_array_equal(result, exp)
def test_filter_against_workaround(self): np.random.seed(0) # Series of ints s = Series(np.random.randint(0, 100, 1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Series of floats s = 100 * Series(np.random.random(1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Set up DataFrame of ints, floats, strings. from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) N = 1000 random_letters = letters.take(np.random.randint(0, 26, N)) df = DataFrame({ 'ints': Series(np.random.randint(0, 100, N)), 'floats': N / 10 * Series(np.random.random(N)), 'letters': Series(random_letters) }) # Group by ints; filter on floats. grouped = df.groupby('ints') old_way = df[grouped.floats.transform( lambda x: x.mean() > N / 20).astype('bool')] new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) old_way = df[grouped.letters.transform( lambda x: len(x) < N / 10).astype('bool')] new_way = grouped.filter(lambda x: len(x.letters) < N / 10) assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. grouped = df.groupby('letters') old_way = df[grouped.ints.transform( lambda x: x.mean() > N / 20).astype('bool')] new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) assert_frame_equal(new_way, old_way)
def test_filter_against_workaround(self): np.random.seed(0) # Series of ints s = Series(np.random.randint(0, 100, 1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Series of floats s = 100 * Series(np.random.random(1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Set up DataFrame of ints, floats, strings. from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) N = 1000 random_letters = letters.take(np.random.randint(0, 26, N)) df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), 'floats': N / 10 * Series(np.random.random(N)), 'letters': Series(random_letters)}) # Group by ints; filter on floats. grouped = df.groupby('ints') old_way = df[grouped.floats. transform(lambda x: x.mean() > N / 20).astype('bool')] new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) old_way = df[grouped.letters. transform(lambda x: len(x) < N / 10).astype('bool')] new_way = grouped.filter(lambda x: len(x.letters) < N / 10) assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. grouped = df.groupby('letters') old_way = df[grouped.ints. transform(lambda x: x.mean() > N / 20).astype('bool')] new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) assert_frame_equal(new_way, old_way)
def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None): """ Init self from scipy.sparse matrix """ index, columns = self._prep_index(data, index, columns) data = data.tocoo() N = len(index) # Construct a dict of SparseSeries sdict = {} values = Series(data.data, index=data.row, copy=False) for col, rowvals in values.groupby(data.col): # get_blocks expects int32 row indices in sorted order rowvals = rowvals.sort_index() rows = rowvals.index.values.astype(np.int32) blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries(rowvals.values, index=index, fill_value=fill_value, sparse_index=BlockIndex( N, blocs, blens)) # Add any columns that were empty and thus not grouped on above sdict.update({ column: SparseSeries(index=index, fill_value=fill_value, sparse_index=BlockIndex(N, [], [])) for column in columns if column not in sdict }) return self._init_dict(sdict, index, columns, dtype)
def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None): """ Init self from scipy.sparse matrix """ index, columns = self._prep_index(data, index, columns) data = data.tocoo() N = len(index) # Construct a dict of SparseSeries sdict = {} values = Series(data.data, index=data.row, copy=False) for col, rowvals in values.groupby(data.col): # get_blocks expects int32 row indices in sorted order rowvals = rowvals.sort_index() rows = rowvals.index.values.astype(np.int32) blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries( rowvals.values, index=index, fill_value=fill_value, sparse_index=BlockIndex(N, blocs, blens)) # Add any columns that were empty and thus not grouped on above sdict.update({column: SparseSeries(index=index, fill_value=fill_value, sparse_index=BlockIndex(N, [], [])) for column in columns if column not in sdict}) return self._init_dict(sdict, index, columns, dtype)
def test_groupby_categorical_unequal_len(self): # GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) # The raises only happens with categorical, not with series of types # category bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
def test_groupby_categorical_unequal_len(self): # GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) # The raises only happens with categorical, not with series of types # category bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here self.assertRaises(ValueError, lambda: series.groupby(bins).mean())