Beispiel #1
0
    def test_size(self):
        """Check ``GroupBy.size`` against group lengths and ``apply``.

        Three things are verified: for ``self.df`` grouped by ``['A', 'B']``,
        ``'A'`` and ``'B'`` the reported size equals ``len`` of each group;
        on a random integer frame ``size`` equals applying ``shape[0]`` to
        column ``'c'`` for every sort/key combination; and an empty frame
        grouped by ``'A'`` gives an empty int64 series (GH11699).
        """
        for keys in (['A', 'B'], 'A', 'B'):
            by_keys = self.df.groupby(keys)
            sizes = by_keys.size()
            for label, chunk in by_keys:
                assert sizes[label] == len(chunk)

        random_frame = DataFrame(np.random.choice(20, (1000, 3)),
                                 columns=list('abc'))
        for sort in (False, True):
            for key in ('a', 'b', ['a', 'b']):
                via_size = random_frame.groupby(key, sort=sort).size()
                via_apply = random_frame.groupby(key, sort=sort)['c'].apply(
                    lambda col: col.shape[0])
                assert_series_equal(via_size, via_apply, check_names=False)

        # GH11699
        empty_frame = DataFrame([], columns=['A', 'B'])
        expected = Series([], dtype='int64', index=Index([], name='A'))
        assert_series_equal(empty_frame.groupby('A').size(), expected)
Beispiel #2
0
def test_size(df):
    """Check ``GroupBy.size`` against group lengths and ``apply``.

    Three things are verified: for ``df`` grouped by ``['A', 'B']``, ``'A'``
    and ``'B'`` the reported size equals ``len`` of each group; on a random
    integer frame ``size`` equals applying ``shape[0]`` to column ``'c'`` for
    every sort/key combination; and an empty frame grouped by ``'A'`` gives
    an empty int64 series (GH11699).
    """
    for keys in (['A', 'B'], 'A', 'B'):
        by_keys = df.groupby(keys)
        sizes = by_keys.size()
        for label, chunk in by_keys:
            assert sizes[label] == len(chunk)

    random_frame = DataFrame(np.random.choice(20, (1000, 3)),
                             columns=list('abc'))
    for sort in (False, True):
        for key in ('a', 'b', ['a', 'b']):
            via_size = random_frame.groupby(key, sort=sort).size()
            via_apply = random_frame.groupby(key, sort=sort)['c'].apply(
                lambda col: col.shape[0])
            tm.assert_series_equal(via_size, via_apply, check_names=False)

    # GH11699
    empty_frame = DataFrame([], columns=['A', 'B'])
    expected = Series([], dtype='int64', index=Index([], name='A'))
    tm.assert_series_equal(empty_frame.groupby('A').size(), expected)
Beispiel #3
0
    def test_xs_level_multiple(self):
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_csv(StringIO(text), sep=r'\s+', engine='python')

        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        tm.assert_frame_equal(result, expected)

        # this is a copy in 0.14
        result = df.xs(('a', 4), level=['one', 'four'])

        # setting this will give a SettingWithCopyError
        # as we are trying to write a view
        def f(x):
            x[:] = 10

        pytest.raises(com.SettingWithCopyError, f, result)

        # GH2107
        dates = lrange(20111201, 20111205)
        ids = 'abcde'
        idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)])
        idx.names = ['date', 'secid']
        df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z'])

        rs = df.xs(20111201, level='date')
        xp = df.loc[20111201, :]
        tm.assert_frame_equal(rs, xp)
    def test_series_groupby_nunique(self):
        """Check grouped ``nunique`` against ``apply(Series.nunique)``.

        Random frames of several sizes are tested, first as generated and
        then again after ``None`` has been written into each column.
        """
        def check_nunique(df, keys, as_index=True):
            # compare both code paths for every (sort, dropna) combination
            flags = (False, True)
            for sort, dropna in cart_product(flags, flags):
                direct = df.groupby(
                    keys, as_index=as_index, sort=sort
                )['julie'].nunique(dropna=dropna)

                applied = df.groupby(
                    keys, as_index=as_index, sort=sort
                )['julie'].apply(Series.nunique, dropna=dropna)
                if not as_index:
                    applied = applied.reset_index(drop=True)

                assert_series_equal(direct, applied, check_names=False)

        days = date_range('2015-08-23', periods=10)
        letters = list(ascii_lowercase)
        key_sets = (['jim'], ['jim', 'joe'])
        # (start, step, column) slices that are overwritten with None
        holes = (
            (1, 17, 'jim'),
            (3, 37, 'joe'),
            (7, 19, 'julie'),
            (8, 19, 'julie'),
            (9, 19, 'julie'),
        )

        for n, m in cart_product(10**np.arange(2, 6), (10, 100, 1000)):
            frame = DataFrame({
                'jim': np.random.choice(letters, n),
                'joe': np.random.choice(days, n),
                'julie': np.random.randint(0, m, n),
            })

            for keys in key_sets:
                check_nunique(frame, keys)

            for start, step, column in holes:
                frame.loc[start::step, column] = None

            for as_index in (True, False):
                for keys in key_sets:
                    check_nunique(frame, keys, as_index=as_index)
Beispiel #5
0
    def test_xs_level_multiple(self):
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_csv(StringIO(text), sep=r'\s+', engine='python')

        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        tm.assert_frame_equal(result, expected)

        # this is a copy in 0.14
        result = df.xs(('a', 4), level=['one', 'four'])

        # setting this will give a SettingWithCopyError
        # as we are trying to write a view
        def f(x):
            x[:] = 10

        pytest.raises(com.SettingWithCopyError, f, result)

        # GH2107
        dates = lrange(20111201, 20111205)
        ids = 'abcde'
        idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)])
        idx.names = ['date', 'secid']
        df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z'])

        rs = df.xs(20111201, level='date')
        xp = df.loc[20111201, :]
        tm.assert_frame_equal(rs, xp)
Beispiel #6
0
        def check_nunique(df, keys, as_index=True):
            """Assert grouped ``nunique`` equals ``apply(Series.nunique)``.

            Column 'julie' of ``df`` grouped by ``keys`` is compared once
            for every (sort, dropna) pair drawn from False/True.
            """
            flags = (False, True)
            for sort, dropna in cart_product(flags, flags):
                direct = df.groupby(
                    keys, as_index=as_index, sort=sort
                )['julie'].nunique(dropna=dropna)

                applied = df.groupby(
                    keys, as_index=as_index, sort=sort
                )['julie'].apply(Series.nunique, dropna=dropna)
                if not as_index:
                    applied = applied.reset_index(drop=True)

                assert_series_equal(direct, applied, check_names=False)
        def check_nunique(df, keys, as_index=True):
            """Assert grouped ``nunique`` equals ``apply(Series.nunique)``.

            Column 'julie' of ``df`` grouped by ``keys`` is compared once
            for every combination of the ``sort`` and ``dropna`` flags.
            """
            for sort in (False, True):
                for dropna in (False, True):
                    grouped = df.groupby(keys, as_index=as_index, sort=sort)
                    direct = grouped['julie'].nunique(dropna=dropna)

                    grouped = df.groupby(keys, as_index=as_index, sort=sort)
                    applied = grouped['julie'].apply(
                        Series.nunique, dropna=dropna)
                    if not as_index:
                        applied = applied.reset_index(drop=True)

                    assert_series_equal(direct, applied, check_names=False)
Beispiel #8
0
def test_xs_integer_key():
    """Check that ``xs`` with an integer key on a named level equals ``loc``."""
    # see gh-2107
    date_keys = lrange(20111201, 20111205)
    sec_ids = 'abcde'
    pairs = list(cart_product(date_keys, sec_ids))
    index = MultiIndex.from_tuples(pairs, names=['date', 'secid'])
    values = np.random.randn(len(index), 3)
    frame = DataFrame(values, index=index, columns=['X', 'Y', 'Z'])

    via_xs = frame.xs(20111201, level='date')
    via_loc = frame.loc[20111201, :]
    tm.assert_frame_equal(via_xs, via_loc)
    def test_ngroup_cumcount_pair(self):
        """Compare ``ngroup`` and ``cumcount`` with hand-computed values.

        Every length-4 sequence over ``range(3)`` is grouped by its own
        values and checked against plain-Python expectations.
        """
        # brute force comparison for all small series
        for values in cart_product(range(3), repeat=4):
            grouped = DataFrame({'a': values}).groupby(['a'])

            # group number: position of the value among the sorted uniques
            rank_of = {val: pos for pos, val in enumerate(sorted(set(values)))}
            expected_ngroup = [rank_of[val] for val in values]

            # cumulative count: earlier occurrences of the same value
            seen = {}
            expected_cumcount = []
            for val in values:
                expected_cumcount.append(seen.get(val, 0))
                seen[val] = seen.get(val, 0) + 1

            assert_series_equal(grouped.ngroup(), Series(expected_ngroup))
            assert_series_equal(grouped.cumcount(), Series(expected_cumcount))
Beispiel #10
0
def test_xs_integer_key():
    """Check that ``xs`` with an integer key on a named level equals ``loc``."""
    # see gh-2107
    date_keys = lrange(20111201, 20111205)
    sec_ids = 'abcde'
    pairs = list(cart_product(date_keys, sec_ids))
    index = MultiIndex.from_tuples(pairs, names=['date', 'secid'])
    values = np.random.randn(len(index), 3)
    frame = DataFrame(values, index=index, columns=['X', 'Y', 'Z'])

    via_xs = frame.xs(20111201, level='date')
    via_loc = frame.loc[20111201, :]
    tm.assert_frame_equal(via_xs, via_loc)
Beispiel #11
0
    def test_series_groupby_nunique(self):
        """Check grouped ``nunique`` against ``apply(Series.nunique)``.

        Random frames of several sizes are tested, first as generated and
        then again after ``None`` has been written into each column.
        """

        def check_nunique(df, keys, as_index=True):
            # compare both code paths for every (sort, dropna) combination
            for sort in (False, True):
                for dropna in (False, True):
                    grouped = df.groupby(keys, as_index=as_index, sort=sort)
                    direct = grouped['julie'].nunique(dropna=dropna)

                    grouped = df.groupby(keys, as_index=as_index, sort=sort)
                    applied = grouped['julie'].apply(
                        Series.nunique, dropna=dropna)
                    if not as_index:
                        applied = applied.reset_index(drop=True)

                    assert_series_equal(direct, applied, check_names=False)

        days = date_range('2015-08-23', periods=10)
        letters = list(ascii_lowercase)
        key_sets = (['jim'], ['jim', 'joe'])
        # (start, step, column) slices that are overwritten with None
        holes = (
            (1, 17, 'jim'),
            (3, 37, 'joe'),
            (7, 19, 'julie'),
            (8, 19, 'julie'),
            (9, 19, 'julie'),
        )

        for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)):
            frame = DataFrame({
                'jim': np.random.choice(letters, n),
                'joe': np.random.choice(days, n),
                'julie': np.random.randint(0, m, n),
            })

            for keys in key_sets:
                check_nunique(frame, keys)

            for start, step, column in holes:
                frame.loc[start::step, column] = None

            for as_index in (True, False):
                for keys in key_sets:
                    check_nunique(frame, keys, as_index=as_index)
Beispiel #12
0
        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
        df[df.PRICE == 25499].VOLUME.describe().values.tolist()
    ]
    expected = pd.DataFrame(
        data,
        index=pd.Index([24990, 25499], name='PRICE'),
        columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
    tm.assert_frame_equal(result, expected)


# nunique
# --------------------------------


@pytest.mark.parametrize("n, m",
                         cart_product(10**np.arange(2, 6), (10, 100, 1000)))
@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2))
def test_series_groupby_nunique(n, m, sort, dropna):
    """Compare grouped ``nunique`` with ``apply(Series.nunique)``.

    Parametrized over every (n, m) pair from ``10**np.arange(2, 6)`` x
    ``(10, 100, 1000)`` and every (sort, dropna) pair of booleans.
    """
    def check_nunique(df, keys, as_index=True):
        # Group ``df`` by ``keys`` twice with identical options and assert
        # that ``nunique`` on column 'julie' equals applying
        # ``Series.nunique`` to each group.  ``sort`` and ``dropna`` are
        # taken from the enclosing test's parameters.
        gr = df.groupby(keys, as_index=as_index, sort=sort)
        left = gr['julie'].nunique(dropna=dropna)

        gr = df.groupby(keys, as_index=as_index, sort=sort)
        right = gr['julie'].apply(Series.nunique, dropna=dropna)
        if not as_index:
            # drop the index of the applied result before comparing
            right = right.reset_index(drop=True)

        # names are not compared (check_names=False)
        tm.assert_series_equal(left, right, check_names=False)

    # ten consecutive daily timestamps starting 2015-08-23
    days = date_range('2015-08-23', periods=10)
Beispiel #13
0
    df = pd.DataFrame({'PRICE': prices,
                       'VOLUME': volumes})
    result = df.groupby('PRICE').VOLUME.describe()
    data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
            df[df.PRICE == 25499].VOLUME.describe().values.tolist()]
    expected = pd.DataFrame(data,
                            index=pd.Index([24990, 25499], name='PRICE'),
                            columns=['count', 'mean', 'std', 'min',
                                     '25%', '50%', '75%', 'max'])
    tm.assert_frame_equal(result, expected)


# nunique
# --------------------------------

@pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6),
                                              (10, 100, 1000)))
@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2))
def test_series_groupby_nunique(n, m, sort, dropna):

    def check_nunique(df, keys, as_index=True):
        gr = df.groupby(keys, as_index=as_index, sort=sort)
        left = gr['julie'].nunique(dropna=dropna)

        gr = df.groupby(keys, as_index=as_index, sort=sort)
        right = gr['julie'].apply(Series.nunique, dropna=dropna)
        if not as_index:
            right = right.reset_index(drop=True)

        tm.assert_series_equal(left, right, check_names=False)

    days = date_range('2015-08-23', periods=10)