Code Example #1
File: test_categorical.py Project: sunyuzheng/pandas
    def test_groupby_categorical_no_compress(self):
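        # The grouped result must not be "compressed" to only the observed groups:
        # its index should carry every category of the grouper, observed or not.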
        data = Series(np.random.randn(9))

        codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
        cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean()

        exp.index = CategoricalIndex(exp.index,
                                     categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
        cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean().reindex(cats.categories)
        exp.index = CategoricalIndex(exp.index,
                                     categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c", "d"],
                           ordered=True)
        data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

        result = data.groupby("b").mean()
        result = result["a"].values
        exp = np.array([1, 2, 4, np.nan])
        self.assert_numpy_array_equal(result, exp)
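For reference, the behaviour this test pins down can be reduced to a minimal standalone sketch. It assumes a pandas version in which Series.groupby accepts the observed keyword; the data values below are illustrative and not taken from the test.

    import pandas as pd

    # Ordered categorical with an unobserved category "d".
    cats = pd.Categorical(["a", "a", "b", "b", "c", "c"],
                          categories=["a", "b", "c", "d"], ordered=True)
    values = pd.Series([1.0, 1.0, 2.0, 2.0, 4.0, 4.0])

    # With observed=False the result index keeps all four categories;
    # the unobserved "d" group comes back as NaN instead of being dropped.
    means = values.groupby(cats, observed=False).mean()
    # Expected: a -> 1.0, b -> 2.0, c -> 4.0, d -> NaN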
Code Example #2
File: test_filters.py Project: zmyer/pandas
    def test_filter_against_workaround(self):
        np.random.seed(0)
        # Series of ints
        s = Series(np.random.randint(0, 100, 1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10

        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Series of floats
        s = 100 * Series(np.random.random(1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10
        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Set up DataFrame of ints, floats, strings.
        from string import ascii_lowercase
        letters = np.array(list(ascii_lowercase))
        N = 1000
        random_letters = letters.take(np.random.randint(0, 26, N))
        df = DataFrame({
            'ints': Series(np.random.randint(0, 100, N)),
            'floats': N / 10 * Series(np.random.random(N)),
            'letters': Series(random_letters)
        })

        # Group by ints; filter on floats.
        grouped = df.groupby('ints')
        old_way = df[grouped.floats.transform(
            lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)

        # Group by floats (rounded); filter on strings.
        grouper = df.floats.apply(lambda x: np.round(x, -1))
        grouped = df.groupby(grouper)
        old_way = df[grouped.letters.transform(
            lambda x: len(x) < N / 10).astype('bool')]
        new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
        assert_frame_equal(new_way, old_way)

        # Group by strings; filter on ints.
        grouped = df.groupby('letters')
        old_way = df[grouped.ints.transform(
            lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)
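The pattern exercised above can be shown in a compact sketch: DataFrameGroupBy.filter keeps every row of the groups that pass the predicate, which should coincide with the older transform-plus-boolean-mask workaround. The toy frame and the threshold of 5 are illustrative values, not taken from the test.

    import pandas as pd

    df = pd.DataFrame({"key": list("aabbbcc"),
                       "val": [1, 2, 10, 11, 12, 3, 4]})
    grouped = df.groupby("key")

    # filter() keeps all rows of the groups whose mean "val" exceeds 5 ...
    new_way = grouped.filter(lambda g: g["val"].mean() > 5)

    # ... which matches the workaround of building a per-row boolean mask
    # with transform() and indexing the original frame with it.
    mask = grouped["val"].transform(lambda v: v.mean() > 5).astype(bool)
    old_way = df[mask]

    assert new_way.equals(old_way)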
Code Example #3
File: frame.py Project: sechilds/pandas
    def _init_spmatrix(self, data, index, columns, dtype=None,
                       fill_value=None):
        """ Init self from scipy.sparse matrix """
        index, columns = self._prep_index(data, index, columns)
        data = data.tocoo()
        N = len(index)

        # Construct a dict of SparseSeries
        sdict = {}
        values = Series(data.data, index=data.row, copy=False)
        for col, rowvals in values.groupby(data.col):
            # get_blocks expects int32 row indices in sorted order
            rowvals = rowvals.sort_index()
            rows = rowvals.index.values.astype(np.int32)
            blocs, blens = get_blocks(rows)

            sdict[columns[col]] = SparseSeries(
                rowvals.values, index=index,
                fill_value=fill_value,
                sparse_index=BlockIndex(N, blocs, blens))

        # Add any columns that were empty and thus not grouped on above
        sdict.update({column: SparseSeries(index=index,
                                           fill_value=fill_value,
                                           sparse_index=BlockIndex(N, [], []))
                      for column in columns
                      if column not in sdict})

        return self._init_dict(sdict, index, columns, dtype)
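The core of this constructor is the groupby over the COO column indices: the matrix's non-zero values become a Series indexed by row position, and grouping that Series by data.col yields one block of row/value pairs per non-empty column. A rough standalone sketch of just that step, using scipy.sparse directly rather than pandas internals (the small random test matrix is purely illustrative):

    import pandas as pd
    from scipy import sparse

    # Small random sparse matrix in COO format.
    mat = sparse.random(5, 3, density=0.4, format="coo", random_state=0)

    # Non-zero values keyed by their row position, as in _init_spmatrix.
    values = pd.Series(mat.data, index=mat.row, copy=False)

    # One group per non-empty column; after sorting by row, each group holds
    # that column's non-zero values, ready to become a sparse column.
    for col, rowvals in values.groupby(mat.col):
        print(col, rowvals.sort_index().to_dict())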
Code Example #4
File: test_categorical.py Project: sunyuzheng/pandas
    def test_groupby_categorical_unequal_len(self):
        # GH3011
        series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
        # The error is only raised when grouping by a Categorical object,
        # not when grouping by a Series of dtype "category"
        bins = pd.cut(series.dropna().values, 4)

        # len(bins) != len(series) here
        self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
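As a standalone illustration of the failure mode (the exact error message, and in later pandas versions possibly the exception type, may differ, so this is only a sketch): binning only the non-NaN values yields a grouper with 8 labels for a 10-row Series, and grouping by it should raise rather than silently misalign.

    import numpy as np
    import pandas as pd

    series = pd.Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])

    # Binning the non-NaN values gives 8 bin labels for 10 rows.
    bins = pd.cut(series.dropna().values, 4)

    # Grouping by a Categorical of mismatched length should fail.
    try:
        series.groupby(bins).mean()
    except ValueError as err:
        print("raised as expected:", err)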