Exemple #1
0
    def test_filter_maintains_ordering(self):
        # Simple case: index is sequential. #4621
        df = DataFrame({
            'pid': [1, 1, 1, 2, 2, 3, 3, 3],
            'tag': [23, 45, 62, 24, 45, 34, 25, 62]
        })
        s = df['pid']
        grouped = df.groupby('tag')
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = df.iloc[[1, 2, 4, 7]]
        assert_frame_equal(actual, expected)

        grouped = s.groupby(df['tag'])
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = s.iloc[[1, 2, 4, 7]]
        assert_series_equal(actual, expected)

        # Now index is sequentially decreasing.
        df.index = np.arange(len(df) - 1, -1, -1)
        s = df['pid']
        grouped = df.groupby('tag')
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = df.iloc[[1, 2, 4, 7]]
        assert_frame_equal(actual, expected)

        grouped = s.groupby(df['tag'])
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = s.iloc[[1, 2, 4, 7]]
        assert_series_equal(actual, expected)

        # Index is shuffled.
        SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
        df.index = df.index[SHUFFLED]
        s = df['pid']
        grouped = df.groupby('tag')
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = df.iloc[[1, 2, 4, 7]]
        assert_frame_equal(actual, expected)

        grouped = s.groupby(df['tag'])
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = s.iloc[[1, 2, 4, 7]]
        assert_series_equal(actual, expected)
Exemple #2
0
    def test_groupby_multi_categorical_as_index(self):
        """Group on a categorical plus a second key with ``as_index=False``.

        Regression test for GH13204.  Every expected frame lists all six
        pairings of the three declared categories with two ``A`` values,
        including pairings that never occur in the input (their ``B`` entry
        is ``nan``).  Three grouper combinations are exercised: two column
        labels, a column label plus a callable, and a column label plus a
        Series that shares its name with a column.  The closing loop renames
        the frame's index and checks that the result does not change.
        """

        def expected_frame(a_values):
            # Built afresh for each comparison so expectations share no data.
            return DataFrame(
                {
                    "cat": Categorical([1, 1, 2, 2, 3, 3]),
                    "A": a_values,
                    "B": [101.0, nan, nan, 205.0, nan, nan],
                },
                columns=["cat", "A", "B"],
            )

        df = DataFrame(
            {
                "cat": Categorical([1, 2, 2], [1, 2, 3]),
                "A": [10, 11, 11],
                "B": [101, 102, 103],
            }
        )

        # Both keys are column labels.
        by_labels = df.groupby(["cat", "A"], as_index=False).sum()
        tm.assert_frame_equal(by_labels, expected_frame([10, 11, 10, 11, 10, 11]))

        # function grouper
        by_callable = df.groupby(
            ["cat", lambda r: df.loc[r, "A"]], as_index=False
        ).sum()
        tm.assert_frame_equal(
            by_callable, expected_frame([10.0, nan, nan, 22.0, nan, nan])
        )

        # another not in-axis grouper (conflicting names in index)
        letters = Series(["a", "b", "b"], name="cat")
        by_series = df.groupby(["cat", letters], as_index=False).sum()
        tm.assert_frame_equal(
            by_series, expected_frame([10.0, nan, nan, 22.0, nan, nan])
        )

        # is original index dropped?
        unchanged = expected_frame([10, 11, 10, 11, 10, 11])
        for index_name in (None, "X", "B", "cat"):
            df.index = Index(list("abc"), name=index_name)
            regrouped = df.groupby(["cat", "A"], as_index=False).sum()
            tm.assert_frame_equal(regrouped, unchanged, check_index_type=True)
Exemple #3
0
    def test_filter_maintains_ordering(self):
        # Simple case: index is sequential. #4621
        df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                        'tag': [23, 45, 62, 24, 45, 34, 25, 62]})
        s = df['pid']
        grouped = df.groupby('tag')
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = df.iloc[[1, 2, 4, 7]]
        assert_frame_equal(actual, expected)

        grouped = s.groupby(df['tag'])
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = s.iloc[[1, 2, 4, 7]]
        assert_series_equal(actual, expected)

        # Now index is sequentially decreasing.
        df.index = np.arange(len(df) - 1, -1, -1)
        s = df['pid']
        grouped = df.groupby('tag')
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = df.iloc[[1, 2, 4, 7]]
        assert_frame_equal(actual, expected)

        grouped = s.groupby(df['tag'])
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = s.iloc[[1, 2, 4, 7]]
        assert_series_equal(actual, expected)

        # Index is shuffled.
        SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
        df.index = df.index[SHUFFLED]
        s = df['pid']
        grouped = df.groupby('tag')
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = df.iloc[[1, 2, 4, 7]]
        assert_frame_equal(actual, expected)

        grouped = s.groupby(df['tag'])
        actual = grouped.filter(lambda x: len(x) > 1)
        expected = s.iloc[[1, 2, 4, 7]]
        assert_series_equal(actual, expected)
    def test_groupby_multi_categorical_as_index(self):
        """Check ``as_index=False`` sums when one grouping key is categorical.

        Regression test for GH13204.  The expected frames hold one row per
        pairing of the three declared categories with two ``A`` values;
        pairings absent from the input carry ``nan`` in ``B``.  The groupers
        tried are two column labels, a label plus a callable, and a label
        plus a Series named like an existing column.  Finally the index is
        renamed several times: when its name matches one of the grouping
        columns the call must emit a ``FutureWarning``, and in every case the
        result must equal the column-label expectation.
        """

        def make_expected(a_values):
            # Constructed afresh for every comparison.
            return DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
                              'A': a_values,
                              'B': [101.0, nan, nan, 205.0, nan, nan]},
                             columns=['cat', 'A', 'B'])

        frame = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                           'A': [10, 11, 11],
                           'B': [101, 102, 103]})

        # Both keys are column labels.
        summed = frame.groupby(['cat', 'A'], as_index=False).sum()
        tm.assert_frame_equal(summed, make_expected([10, 11, 10, 11, 10, 11]))

        # function grouper
        summed = frame.groupby(['cat', lambda r: frame.loc[r, 'A']],
                               as_index=False).sum()
        tm.assert_frame_equal(
            summed, make_expected([10.0, nan, nan, 22.0, nan, nan]))

        # another not in-axis grouper (conflicting names in index)
        named_like_column = Series(['a', 'b', 'b'], name='cat')
        summed = frame.groupby(['cat', named_like_column],
                               as_index=False).sum()
        tm.assert_frame_equal(
            summed, make_expected([10.0, nan, nan, 22.0, nan, nan]))

        # is original index dropped?
        expected = make_expected([10, 11, 10, 11, 10, 11])
        group_columns = ['cat', 'A']

        def sum_by_columns():
            return frame.groupby(group_columns, as_index=False).sum()

        for name in (None, 'X', 'B', 'cat'):
            frame.index = Index(list("abc"), name=name)
            clashes = name in group_columns and name in frame.index.names

            if not clashes:
                result = sum_by_columns()
            else:
                # An index name equal to a grouping column must warn.
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    result = sum_by_columns()

            tm.assert_frame_equal(result, expected, check_index_type=True)
    def test_groupby_multi_categorical_as_index(self):
        """Sum over a categorical key and a second key with ``as_index=False``.

        Regression test for GH13204.  Three grouper combinations are run
        from a small table (two column labels; a label plus a callable; a
        label plus a Series whose name matches a column), each paired with
        the ``A`` column expected in the output.  The expected frames contain
        every pairing of the three declared categories with two ``A`` values,
        with ``nan`` in ``B`` for pairings missing from the input.  The last
        loop renames the index; a name that equals a grouping column must
        trigger a ``FutureWarning``, and the result must stay the same.
        """
        df = DataFrame({
            'cat': Categorical([1, 2, 2], [1, 2, 3]),
            'A': [10, 11, 11],
            'B': [101, 102, 103]
        })

        def build_expected(a_column):
            return DataFrame(
                {
                    'cat': Categorical([1, 1, 2, 2, 3, 3]),
                    'A': a_column,
                    'B': [101.0, nan, nan, 205.0, nan, nan]
                },
                columns=['cat', 'A', 'B'])

        scenarios = [
            # plain column labels
            (['cat', 'A'],
             [10, 11, 10, 11, 10, 11]),
            # function grouper
            (['cat', lambda r: df.loc[r, 'A']],
             [10.0, nan, nan, 22.0, nan, nan]),
            # another not in-axis grouper (conflicting names in index)
            (['cat', Series(['a', 'b', 'b'], name='cat')],
             [10.0, nan, nan, 22.0, nan, nan]),
        ]
        for keys, a_column in scenarios:
            result = df.groupby(keys, as_index=False).sum()
            tm.assert_frame_equal(result, build_expected(a_column))

        # is original index dropped?
        expected = build_expected([10, 11, 10, 11, 10, 11])
        group_columns = ['cat', 'A']

        for name in [None, 'X', 'B', 'cat']:
            df.index = Index(list("abc"), name=name)
            grouped = df.groupby(group_columns, as_index=False)

            if name in group_columns and name in df.index.names:
                # Index name collides with a grouping column: must warn.
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    result = grouped.sum()
            else:
                result = grouped.sum()

            tm.assert_frame_equal(result, expected, check_index_type=True)