Example #1
class Size(object):

    def setup(self):
        n = 10**5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                             'key2': np.random.randint(0, 100, size=n),
                             'value1': np.random.randn(n),
                             'value2': np.random.randn(n),
                             'value3': np.random.randn(n),
                             'dates': dates})
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        self.df.groupby(['key1', 'key2']).size()

    def time_dt_timegrouper_size(self):
        with warnings.catch_warnings(record=True):
            self.df.groupby(TimeGrouper(key='dates', freq='M')).size()

    def time_category_size(self):
        self.draws.groupby(self.cats).size()
Example #2
    def test_groupby_categorical_no_compress(self):
        data = Series(np.random.randn(9))

        codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
        cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean()

        exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
        cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean().reindex(cats.categories)
        exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c", "d"], ordered=True)
        data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

        result = data.groupby("b").mean()
        result = result["a"].values
        exp = np.array([1, 2, 4, np.nan])
        tm.assert_numpy_array_equal(result, exp)
Example #3
class ApplyDictReturn(object):
    def setup(self):
        self.labels = np.arange(1000).repeat(10)
        self.data = Series(np.random.randn(len(self.labels)))

    def time_groupby_apply_dict_return(self):
        self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0],
                                                        'last': x.values[-1]})
def test_bins_unequal_len():
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    with pytest.raises(ValueError):
        series.groupby(bins).mean()
Example #5
class TestTimeGrouper(unittest.TestCase):

    def setUp(self):
        self.ts = Series(np.random.randn(1000),
                         index=date_range('1/1/2000', periods=1000))

    def test_apply(self):
        grouper = TimeGrouper('A', label='right', closed='right')

        grouped = self.ts.groupby(grouper)

        f = lambda x: x.order()[-3:]

        applied = grouped.apply(f)
        expected = self.ts.groupby(lambda x: x.year).apply(f)

        applied.index = applied.index.droplevel(0)
        expected.index = expected.index.droplevel(0)
        assert_series_equal(applied, expected)

    def test_count(self):
        self.ts[::3] = np.nan

        grouper = TimeGrouper('A', label='right', closed='right')
        result = self.ts.resample('A', how='count')

        expected = self.ts.groupby(lambda x: x.year).count()
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_numpy_reduction(self):
        result = self.ts.resample('A', how='prod', closed='right')

        expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_apply_iteration(self):
        # #2300
        N = 1000
        ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
        df = DataFrame({'open':1, 'close':2}, index=ind)
        tg = TimeGrouper('M')

        grouper = tg.get_grouper(df)

        # Errors

        grouped = df.groupby(grouper, group_keys=False)
        f = lambda df: df['close'] / df['open']

        # it works!
        result = grouped.apply(f)
        self.assertTrue(result.index.equals(df.index))
Example #6
class TestTimeSeriesDuplicates(unittest.TestCase):
    def setUp(self):
        dates = [
            datetime(2000, 1, 2),
            datetime(2000, 1, 2),
            datetime(2000, 1, 2),
            datetime(2000, 1, 3),
            datetime(2000, 1, 3),
            datetime(2000, 1, 3),
            datetime(2000, 1, 4),
            datetime(2000, 1, 4),
            datetime(2000, 1, 4),
            datetime(2000, 1, 5),
        ]

        self.dups = Series(np.random.randn(len(dates)), index=dates)

    def test_constructor(self):
        self.assert_(isinstance(self.dups, TimeSeries))
        self.assert_(isinstance(self.dups.index, DatetimeIndex))

    def test_is_unique_monotonic(self):
        self.assert_(not self.dups.index.is_unique)

    def test_index_unique(self):
        uniques = self.dups.index.unique()
        self.assert_(uniques.dtype == "M8")  # sanity

    def test_duplicate_dates_indexing(self):
        ts = self.dups

        uniques = ts.index.unique()

        for date in uniques:
            result = ts[date]

            mask = ts.index == date
            total = (ts.index == date).sum()
            expected = ts[mask]
            if total > 1:
                assert_series_equal(result, expected)
            else:
                assert_almost_equal(result, expected[0])

            cp = ts.copy()
            cp[date] = 0
            expected = Series(np.where(mask, 0, ts), index=ts.index)
            assert_series_equal(cp, expected)

        self.assertRaises(KeyError, ts.__getitem__, datetime(2000, 1, 6))
        self.assertRaises(KeyError, ts.__setitem__, datetime(2000, 1, 6), 0)

    def test_groupby_average_dup_values(self):
        result = self.dups.groupby(level=0).mean()
        expected = self.dups.groupby(self.dups.index).mean()
        assert_series_equal(result, expected)
Example #7
    def test_groupby_count_dateparseerror(self):
        dr = date_range(start='1/1/2012', freq='5min', periods=10)

        # BAD Example, datetimes first
        s = Series(np.arange(10), index=[dr, lrange(10)])
        grouped = s.groupby(lambda x: x[1] % 2 == 0)
        result = grouped.count()

        s = Series(np.arange(10), index=[lrange(10), dr])
        grouped = s.groupby(lambda x: x[0] % 2 == 0)
        expected = grouped.count()

        assert_series_equal(result, expected)
Example #8
def test_filter_against_workaround():
    np.random.seed(0)
    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10

    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})

    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.
                 transform(lambda x: len(x) < N / 10).astype('bool')]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
Example #9
    def test_count_level_series(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]
        )

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0))
def date_11():
    from pandas.tseries.offsets import Day, MonthEnd
    now = datetime(2011, 11, 17)
    print(now + 3 * Day())
    print(now + MonthEnd())
    print(now + MonthEnd(2))

    offset = MonthEnd()
    print(offset.rollforward(now))
    print(offset.rollback(now))

    ts = Series(np.random.randn(20), index=pd.date_range('1/15/2000', periods=20, freq='4d'))
    print(ts.groupby(offset.rollforward).mean())
    print(ts.resample('M', how='mean'))
Example #11
    def test_groupby_grouper_f_sanity_checked(self):
        dates = date_range('01-Jan-2013', periods=12, freq='MS')
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])
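
For illustration (not part of the original test), a minimal sketch of the workaround the comment above hints at: slicing str(key) instead of key yields a valid per-element label, so the grouper length check passes.

import numpy as np
import pandas as pd

dates = pd.date_range('01-Jan-2013', periods=12, freq='MS')
ts = pd.Series(np.random.randn(12), index=dates)
# str(Timestamp('2013-01-01 00:00:00'))[0:6] == '2013-0', so this puts
# January..September under '2013-0' and October..December under '2013-1'
ts.groupby(lambda key: str(key)[0:6]).sum()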
Example #12
    def test_hourly(self):
        rng_hourly = date_range('1/1/1994', periods=(18 * 8760 + 4 * 24),
                                freq='H')
        data_hourly = np.random.randint(100, 350, rng_hourly.size)
        ts_hourly = Series(data_hourly, index=rng_hourly)

        grouped = ts_hourly.groupby(ts_hourly.index.year)
        hoy = grouped.apply(lambda x: x.reset_index(drop=True))
        hoy = hoy.index.droplevel(0).values
        hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24
        hoy += 1

        annual = pivot_annual(ts_hourly)

        ts_hourly = ts_hourly.astype(float)
        for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]:
            subset = ts_hourly[hoy == i]
            subset.index = [x.year for x in subset.index]

            result = annual[i].dropna()
            tm.assert_series_equal(result, subset, check_names=False)
            self.assertEqual(result.name, i)

        leaps = ts_hourly[(ts_hourly.index.month == 2) & (
            ts_hourly.index.day == 29) & (ts_hourly.index.hour == 0)]
        hour = leaps.index.dayofyear[0] * 24 - 23
        leaps.index = leaps.index.year
        leaps.name = 1417
        tm.assert_series_equal(annual[hour].dropna(), leaps)
Example #13
    def test_custom_grouper(self):

        dti = DatetimeIndex(freq="Min", start=datetime(2005, 1, 1), end=datetime(2005, 1, 10))

        data = np.array([1] * len(dti))
        s = Series(data, index=dti)

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        # check all cython functions work
        funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
        for f in funcs:
            g._cython_agg_general(f)

        self.assertEquals(g.ngroups, 2593)
        self.assert_(notnull(g.mean()).all())

        # construct expected val
        arr = [5] * 2592
        arr.append(1)
        idx = dti[0:-1:5]
        idx = idx.append(DatetimeIndex([np.datetime64(dti[-1])]))
        expect = Series(arr, index=idx)

        # cython returns float for now
        result = g.agg(np.sum)
        assert_series_equal(result, expect.astype(float))

        data = np.random.rand(len(dti), 10)
        df = DataFrame(data, index=dti)
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593)
Example #14
def _render_dimensional_metric_cell(row_data: pd.Series, metric: Metric):
    """
    Renders a table cell in a metric column for pivoted tables where there are two or more dimensions. This function
    recurses in order to traverse multi-dimensional indices.

    :param row_data:
        A series containing the value for the metric and its index (for the dimension values).

    :param metric:
        A reference to the slicer metric to access the display formatting.

    :return:
        A deep dict in a tree structure with keys matching each dimension level. The top level will have keys matching
        the first level of dimension values, and the next level will contain the next level of dimension values, for as
        many index levels as there are. The last level will contain the return value of `_format_metric_cell`.
    """
    level = {}

    # Group by the last dimension, drop it, and fill the dict with either the raw metric values or the next level of
    # dicts.
    for key, next_row in row_data.groupby(level=1):
        next_row.reset_index(level=1, drop=True, inplace=True)

        df_key = format_metric_key(metric.key)
        level[key] = _render_dimensional_metric_cell(next_row, metric) \
            if isinstance(next_row.index, pd.MultiIndex) \
            else _format_metric_cell(next_row[df_key], metric)

    return level
Example #15
def test_first_last_nth_dtypes(df_mixed_floats):

    df = df_mixed_floats.copy()
    df['E'] = True
    df['F'] = 1

    # tests for first / last / nth
    grouped = df.groupby('A')
    first = grouped.first()
    expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(first, expected)

    last = grouped.last()
    expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(last, expected)

    nth = grouped.nth(1)
    expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
    expected.index = Index(['bar', 'foo'], name='A')
    expected = expected.sort_index()
    assert_frame_equal(nth, expected)

    # GH 2763, first/last shifting dtypes
    idx = lrange(10)
    idx.append(9)
    s = Series(data=lrange(11), index=idx, name='IntCol')
    assert s.dtype == 'int64'
    f = s.groupby(level=0).first()
    assert f.dtype == 'int64'
Example #16
    def test_custom_grouper(self):

        dti = DatetimeIndex(freq='Min', start=datetime(2005,1,1),
                            end=datetime(2005,1,10))

        data = np.array([1]*len(dti))
        s = Series(data, index=dti)

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        self.assertEquals(g.ngroups, 2593)

        # construct expected val
        arr = [5] * 2592
        arr.append(1)
        idx = dti[0:-1:5]
        idx = idx.append(DatetimeIndex([np.datetime64(dti[-1])]))
        expect = Series(arr, index=idx)

        # cython returns float for now
        result = g.agg(np.sum)
        assert_series_equal(result, expect.astype(float))

        data = np.random.rand(len(dti), 10)
        df = DataFrame(data, index=dti)
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593)
Example #17
def test_nsmallest():
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list('a' * 5 + 'b' * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series([
        1, 2, 3, 0, 4, 6
    ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]]))
    tm.assert_series_equal(r, e)

    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series([
        0, 1, 1, 0, 1, 2
    ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]]))
    tm.assert_series_equal(gb.nsmallest(3, keep='last'), e)
Example #18
def quarter_plot(x, dates=None, ylabel=None, ax=None):
    """
    Seasonal plot of quarterly data

    Parameters
    ----------
    x : array-like
        Seasonal data to plot. If dates is None, x must be a pandas object
        with a PeriodIndex or DatetimeIndex with a quarterly frequency.
    dates : array-like, optional
        If `x` is not a pandas object, then dates must be supplied.
    ylabel : str, optional
        The label for the y-axis. Will attempt to use the `name` attribute
        of the Series.
    ax : matplotlib.axes, optional
        Existing axes instance.

    Returns
    -------
    matplotlib.Figure
    """
    from pandas import DataFrame

    if dates is None:
        from statsmodels.tools.data import _check_period_index
        _check_period_index(x, freq="Q")
    else:
        from pandas import Series, PeriodIndex
        x = Series(x, index=PeriodIndex(dates, freq="Q"))

    xticklabels = ['q1', 'q2', 'q3', 'q4']
    return seasonal_plot(x.groupby(lambda y : y.quarter), xticklabels,
                         ylabel=ylabel, ax=ax)
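
A minimal usage sketch (not from the original source), assuming the statsmodels helpers it relies on (seasonal_plot, _check_period_index) are importable; with a quarterly PeriodIndex the dates argument can be omitted.

import numpy as np
import pandas as pd

rng = pd.period_range('2000Q1', periods=16, freq='Q')
values = pd.Series(np.random.randn(16), index=rng, name='value')
fig = quarter_plot(values)  # seasonal plot with the data grouped under the labels q1..q4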
Example #19
    def test_count_level_series(self):
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2],
                                   [2, 0, 1, 1, 2]])

        s = Series(np.random.randn(len(index)), index=index)

        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))
Example #20
    def test_cython_fail_agg(self):
        dr = bdate_range('1/1/2000', periods=50)
        ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr)

        grouped = ts.groupby(lambda x: x.month)
        summed = grouped.sum()
        expected = grouped.agg(np.sum)
        assert_series_equal(summed, expected)
Example #21
def test_intercept_builtin_sum():
    s = Series([1., 2., np.nan, 3.])
    grouped = s.groupby([0, 1, 2, 2])

    result = grouped.agg(compat.builtins.sum)
    result2 = grouped.apply(compat.builtins.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result2, expected)
Example #22
class DateAttributes(object):

    def setup(self):
        rng = date_range('1/1/2000', '12/31/2005', freq='H')
        self.year, self.month, self.day = rng.year, rng.month, rng.day
        self.ts = Series(np.random.randn(len(rng)), index=rng)

    def time_len_groupby_object(self):
        len(self.ts.groupby([self.year, self.month, self.day]))
def date_06():
    dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000', '1/3/2000'])
    dup_ts = Series(np.arange(5), index=dates)
    print(dup_ts)
    print(dup_ts.index.is_unique)
    print(dup_ts['1/3/2000'])
    print(dup_ts['1/2/2000'])
    grouped = dup_ts.groupby(level=0)
    print(grouped.mean())
    print(grouped.count())
Example #24
    def test_custom_grouper(self):

        dti = DatetimeIndex(freq='Min', start=datetime(2005,1,1),
                            end=datetime(2005,1,10))

        data = np.array([1]*len(dti))
        s = Series(data, index=dti)

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)

        b = TimeGrouper(Minute(5), closed='right', label='right')
        g = s.groupby(b)
        # check all cython functions work
        funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
        for f in funcs:
            g._cython_agg_general(f)


        self.assertEquals(g.ngroups, 2593)
        self.assert_(notnull(g.mean()).all())

        # construct expected val
        arr = [1] + [5] * 2592
        idx = dti[0:-1:5]
        idx = idx.append(dti[-1:])
        expect = Series(arr, index=idx)

        # cython returns float for now
        result = g.agg(np.sum)
        assert_series_equal(result, expect.astype(float))

        data = np.random.rand(len(dti), 10)
        df = DataFrame(data, index=dti)
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593)
    def test_downsample_non_unique(self):
        rng = date_range('1/1/2000', '2/29/2000')
        rng2 = rng.repeat(5).values
        ts = Series(np.random.randn(len(rng2)), index=rng2)

        result = ts.resample('M', how='mean')

        expected = ts.groupby(lambda x: x.month).mean()
        self.assertEquals(len(result), 2)
        assert_almost_equal(result[0], expected[1])
        assert_almost_equal(result[1], expected[2])
Example #26
class Size:

    def setup(self):
        n = 10**5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                             'key2': np.random.randint(0, 100, size=n),
                             'value1': np.random.randn(n),
                             'value2': np.random.randn(n),
                             'value3': np.random.randn(n),
                             'dates': dates})
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        self.df.groupby(['key1', 'key2']).size()

    def time_category_size(self):
        self.draws.groupby(self.cats).size()
Example #27
    def test_groupby_level_with_nas(self, sort):
        # GH 17537
        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
                                                              2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6., 22.], index=[0, 1])
        assert_series_equal(result, expected)

        index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
                           labels=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0,
                                                               1, 2, 3]])

        # factorizing doesn't confuse things
        s = Series(np.arange(8.), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6., 18.], index=[0.0, 1.0])
        assert_series_equal(result, expected)
def slide7():
    from pandas.tseries.offsets import Hour, Minute
    hour = Hour()
    print(hour)
    four_hours = Hour(4)
    print(four_hours)
    print(pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4h'))

    print(Hour(2) + Minute(30))
    print(pd.date_range('1/1/2000', periods=10, freq='1h30min'))

    ts = Series(np.random.randn(4),
                index=pd.date_range('1/1/2000', periods=4, freq='M'))
    print(ts)
    print(ts.shift(2))
    print(ts.shift(-2))
    print('2 M')
    print(ts.shift(2, freq='M'))
    print('3 D')
    print(ts.shift(3, freq='D'))
    print('1 3D')
    print(ts.shift(1, freq='3D'))
    print('1 90T')
    print(ts.shift(1, freq='90T'))

    print('shifting dates with offsets')
    from pandas.tseries.offsets import Day, MonthEnd
    now = datetime(2011, 11, 17)
    print(now + 3 * Day())
    print(now + MonthEnd())
    print(now + MonthEnd(2))

    offset = MonthEnd()
    print(offset)
    print(offset.rollforward(now))
    print(offset.rollback(now))

    ts = Series(np.random.randn(20),
                index=pd.date_range('1/15/2000', periods=20, freq='4d'))
    print(ts.groupby(offset.rollforward).mean())
def slide4():
    dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),
             datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]

    print('Series sample')
    ts = Series(np.random.randn(6), index=dates)
    print(ts)
    print(type(ts))
    print(ts.index)

    print('arithmetic operations')
    print(ts + ts[::2])
    print(ts.index.dtype)

    stamp = ts.index[2]
    print(stamp)
    print('indexing')
    print(ts[stamp])
    print(ts['1/10/2011'])
    print(ts['20110110'])

    longer_ts = Series(np.random.randn(1000),
                       index=pd.date_range('1/1/2000', periods=1000))
    print('longer timestamp')
    print(longer_ts)
    print(longer_ts['2001'])
    print(longer_ts['2001-05'])
    print('indexing range')
    print(ts[datetime(2011, 1, 7):])
    print(ts['1/6/2011':'1/11/2011'])

    print('truncate')
    print(ts.truncate(after='1/9/2011'))

    dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
    long_df = DataFrame(np.random.randn(100, 4),
                        index=dates,
                        columns=['Colorado', 'Texas', 'New York', 'Ohio'])
    print(long_df.ix['5-2001'])

    print('duplicate')
    dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000',
                              '1/3/2000'])
    dup_ts = Series(np.arange(5), index=dates)
    print(dup_ts)
    print(dup_ts.index.is_unique)
    print(dup_ts['1/3/2000'])
    print(dup_ts['1/2/2000'])

    grouped = dup_ts.groupby(level=0)
    print(grouped.mean())
    print(grouped.count())
Example #30
    def test_custom_grouper(self):

        dti = DatetimeIndex(freq="Min", start=datetime(2005, 1, 1), end=datetime(2005, 1, 10))

        s = Series(np.array([1] * len(dti)), index=dti, dtype="int64")

        b = TimeGrouper(Minute(5))
        g = s.groupby(b)

        # check all cython functions work
        funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
        for f in funcs:
            g._cython_agg_general(f)

        b = TimeGrouper(Minute(5), closed="right", label="right")
        g = s.groupby(b)
        # check all cython functions work
        funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"]
        for f in funcs:
            g._cython_agg_general(f)

        self.assertEquals(g.ngroups, 2593)
        self.assert_(notnull(g.mean()).all())

        # construct expected val
        arr = [1] + [5] * 2592
        idx = dti[0:-1:5]
        idx = idx.append(dti[-1:])
        expect = Series(arr, index=idx)

        # GH2763 - return input dtype if we can
        result = g.agg(np.sum)
        assert_series_equal(result, expect)

        df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype="float64")
        r = df.groupby(b).agg(np.sum)

        self.assertEquals(len(r.columns), 10)
        self.assertEquals(len(r.index), 2593)
Example #31
def test_filter_series():
    s = Series([1, 3, 20, 5, 22, 24, 7])
    expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
    expected_even = Series([20, 22, 24], index=[2, 4, 5])
    grouper = s.apply(lambda x: x % 2)
    grouped = s.groupby(grouper)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10),
                           expected_odd)
    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10),
                           expected_even)
    # Test dropna=False.
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() < 10, dropna=False),
        expected_odd.reindex(s.index),
    )
    tm.assert_series_equal(
        grouped.filter(lambda x: x.mean() > 10, dropna=False),
        expected_even.reindex(s.index),
    )
Example #32
    def test_multi_iter(self):
        s = Series(np.arange(6))
        k1 = np.array(["a", "a", "a", "b", "b", "b"])
        k2 = np.array(["1", "2", "1", "2", "1", "2"])

        grouped = s.groupby([k1, k2])

        iterated = list(grouped)
        expected = [
            ("a", "1", s[[0, 2]]),
            ("a", "2", s[[1]]),
            ("b", "1", s[[4]]),
            ("b", "2", s[[3, 5]]),
        ]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            tm.assert_series_equal(three, e3)
Example #33
    def test_groupby_rolling_index_changed(self, func):
        # GH: #36018 nlevels of MultiIndex changed
        ds = Series(
            [1, 2, 2],
            index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"),
                                             ("c", "z")],
                                            names=["1", "2"]),
            name="a",
        )

        result = getattr(ds.groupby(ds).rolling(2), func)()
        expected = Series(
            [np.nan, np.nan, 2.0],
            index=pd.MultiIndex.from_tuples([(1, "a", "x"), (2, "a", "y"),
                                             (2, "c", "z")],
                                            names=["a", "1", "2"]),
            name="a",
        )
        tm.assert_series_equal(result, expected)
Example #34
def ts_rank(x: pd.Series, d: int or float) -> pd.Series:
    """
    ts_rank(x, d) = time-series rank in the past d days
    :param x:
    :param d:
    :return:
    """
    if isinstance(d, float):
        d = math.floor(d)

    def func(a):
        # argsort of argsort gives each element's rank within the window;
        # [-1] + 1 is then the 1-based rank of the most recent value
        return a.argsort().argsort()[-1] + 1

    if isinstance(x.index, pd.MultiIndex):
        return x.groupby(
            level=1).rolling(d).apply(func).droplevel(0).sort_index()
    else:
        return x.rolling(d).apply(func)
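
A hedged usage sketch for the single-index branch; a DatetimeIndex is used because func reads the last window element positionally with [-1], which would fail on a plain integer index.

import pandas as pd

s = pd.Series([3.0, 1.0, 2.0, 5.0, 4.0],
              index=pd.date_range('2020-01-01', periods=5))
ts_rank(s, 3)
# the final window is [2.0, 5.0, 4.0]; the latest value 4.0 is the 2nd smallest,
# so the last output is 2.0 (the first d - 1 outputs are NaN)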
Example #35
def test_nlargest_mi_grouper():
    # see gh-21411
    npr = np.random.RandomState(123456789)

    dts = date_range("20180101", periods=10)
    iterables = [dts, ["one", "two"]]

    idx = MultiIndex.from_product(iterables, names=["first", "second"])
    s = Series(npr.randn(20), index=idx)

    result = s.groupby("first").nlargest(1)

    exp_idx = MultiIndex.from_tuples(
        [
            (dts[0], dts[0], "one"),
            (dts[1], dts[1], "one"),
            (dts[2], dts[2], "one"),
            (dts[3], dts[3], "two"),
            (dts[4], dts[4], "one"),
            (dts[5], dts[5], "one"),
            (dts[6], dts[6], "one"),
            (dts[7], dts[7], "one"),
            (dts[8], dts[8], "two"),
            (dts[9], dts[9], "one"),
        ],
        names=["first", "first", "second"],
    )

    exp_values = [
        2.2129019979039612,
        1.8417114045748335,
        0.858963679564603,
        1.3759151378258088,
        0.9430284594687134,
        0.5296914208183142,
        0.8318045593815487,
        -0.8476703342910327,
        0.3804446884133735,
        -0.8028845810770998,
    ]

    expected = Series(exp_values, index=exp_idx)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
Example #36
def build_ror_data(ror_capacity_ds: pd.Series, timestamps: pd.DatetimeIndex,
                   runoff_dataset: xr.Dataset, runoff_points_region_ds: pd.Series) -> Tuple[pd.Series, pd.DataFrame]:
    """
    Compute total ROR capacities (in GW) and inflow (p.u. of capacity) for a series of regions.

    Parameters
    ----------
    ror_capacity_ds: pd.Series
        Series containing ROR power (GW) capacity per plant, indexed by the region in which the plant is located.
    timestamps: pd.DatetimeIndex
        Time stamps over which the inflows must be computed.
    runoff_dataset: xr.Dataset
        ERA5 runoff dataset
    runoff_points_region_ds: pd.Series
        Indicates in which region each ERA5 point falls.

    Returns
    -------
    ror_capacity_ds: pd.Series
        Series containing ROR power (GW) capacity per region.
    ror_inflows_df: pd.DataFrame
        ROR inflow time-series (p.u. of power capacity) for each region.
    """

    ror_thresholds_fn = f"{data_path}generation/hydro/source/ror_flood_event_thresholds.csv"
    ror_thresholds = pd.read_csv(ror_thresholds_fn, index_col=0)

    ror_capacity_ds = ror_capacity_ds.groupby(ror_capacity_ds.index).sum() * 1e-3

    ror_inflows_df = pd.DataFrame(index=timestamps, columns=ror_capacity_ds.index)
    for region in ror_capacity_ds.index:
        points = runoff_points_region_ds[runoff_points_region_ds == region].index.to_list()
        flood_event_threshold = ror_thresholds.loc[replace_iso2_codes([region[:2]])[0], 'value']
        if points:
            ror_inflows_df[region] = compute_ror_series(runoff_dataset, points, flood_event_threshold)
    ror_inflows_df.dropna(axis=1, inplace=True)
    missing_inflows_indexes = ~ror_capacity_ds.index.isin(ror_inflows_df.columns)
    missing_ror = ror_capacity_ds.loc[missing_inflows_indexes].dropna().sum()
    ror_capacity_ds = ror_capacity_ds[ror_inflows_df.columns]
    logger.info(f'ROR capacity factors computed. '
                f'{missing_ror} GW removed because of ERA5 point unavailability in regions.')

    return ror_capacity_ds, ror_inflows_df
Example #37
    def test_series_groupby_plotting_nominally_works(self):
        n = 10
        weight = Series(np.random.normal(166, 20, size=n))
        height = Series(np.random.normal(60, 10, size=n))
        with tm.RNGContext(42):
            gender = np.random.choice(['male', 'female'], size=n)

        weight.groupby(gender).plot()
        tm.close()
        height.groupby(gender).hist()
        tm.close()
        # Regression test for GH8733
        height.groupby(gender).plot(alpha=0.5)
        tm.close()
Example #38
    def compute_ytd_returns(time_series: pd.Series):
        """
        Compute year-to-date from a pandas time series
        :param time_series: pandas time series
        :return: pandas time series
        """

        ytd_returns = []

        group_by_object = time_series.groupby(pd.Grouper(freq='A'))
        for group_name, indexes in group_by_object.indices.items():
            start_index = indexes[0] - 1 if indexes[0] > 0 else indexes[0]
            for idx in indexes:
                ytd_return = time_series[idx] / time_series[start_index] - 1.0
                ytd_returns.append(ytd_return)

        result = pd.Series(index=time_series.index, data=ytd_returns)
        assert len(result) == len(time_series)

        return result
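
A hedged usage sketch, treating the helper above as a standalone function (it is written without self): each year's values are divided by the last observation of the previous year, or by the first observation for the first year, so the series restarts near zero each January.

import numpy as np
import pandas as pd

prices = pd.Series(100.0 + np.arange(24, dtype=float),
                   index=pd.date_range('2020-01-31', periods=24, freq='M'))
compute_ytd_returns(prices)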
def _group_mean(weights, ordered_snapshots):
    w = Series(weights)
    w = w.reindex(ordered_snapshots)

    nstates = 16
    snapshots_per_state = len(w) // nstates

    state_indices = []
    for i in range(nstates):
        state_indices += [i] * snapshots_per_state
    assert len(state_indices) == len(w)

    g_mean = w.groupby(state_indices).mean()
    states = []
    mean_w = []
    for i, v in g_mean.iteritems():
        states.append(i)
        mean_w.append(v)

    return np.array(states, dtype=int), np.array(mean_w, dtype=float)
Example #40
def _calibrate_by_index(value_df: pd.DataFrame,
                        calib_series: pd.Series,
                        levels: Union[int, Tuple[int]],
                        ) -> pd.DataFrame:
    if not hasattr(levels, '__len__'):
        levels = (levels, )
    value_factors = value_df.groupby(level=levels).sum()
    calib_sums = calib_series.groupby(level=levels).sum()
    # calib_fractions = value_sums.join(calib_sums, how='outer')
    for col in value_factors:
        value_factors[col] = (calib_sums / value_factors[col]).fillna(1)
    # raise RuntimeError
    with_coefs = value_df.join(
        value_factors,
        on=[name for i, name in enumerate(value_df.index.names) if i in levels],
        rsuffix='_coef'
    )
    for col in value_df.columns:
        with_coefs[col] *= with_coefs[col + '_coef']
    return with_coefs[value_df.columns.tolist()]
Example #41
def get_feature(x: pd.Series, date):
    x = x.dropna()
    name = x.name
    x = pd.DataFrame(x.to_list(), columns=["road_id", "speed", "clock"])
    x = x[x["speed"] <= 30]
    x["clock"] = x.apply(lambda x: pd.Timestamp(2019, int(date[
        0:2]), int(date[2:]), x["clock"][0], x["clock"][1] // 10 * 10, 0),
                         axis=1)

    ret = x.groupby(by=["road_id", "clock"],
                    as_index=False).agg({"speed": ["mean", lambda x: len(x)]})

    ret.columns = ["road_id", "time", "avg", "count"]
    ret = ret[ret["count"] >= 5]

    ret["is_low"] = (ret["avg"] <= 6).astype(np.float64)
    # ret["is_high"] = (ret["avg"] >= 14).astype(np.float64)

    del ret["count"]
    return ret
Example #42
def decay_linear(x: pd.Series, d: int) -> pd.Series:
    """
    decay_linear(x, d) = weighted moving average over the past d days
    with linearly decaying weights d, d – 1, ..., 1 (rescaled to sum up to 1)
    :param x:
    :param d:
    :return:
    """
    # todo https://www.joinquant.com/community/post/detailMobile?postId=10674&page=&limit=20&replyId=&tag=
    if isinstance(d, float):
        d = math.floor(d)

    def func(a):
        weights = np.arange(1, d + 1)
        weights = weights / weights.sum()
        return np.nansum(weights * a)

    if isinstance(x.index, pd.MultiIndex):
        return x.groupby(level=1).rolling(d).apply(func)
    else:
        return x.rolling(d).apply(func)
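
A minimal usage sketch (not from the original source) for the single-index branch, with the weighting spelled out.

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
decay_linear(s, 3)
# weights are [1/6, 2/6, 3/6]; e.g. the window [1, 2, 3] gives
# (1*1 + 2*2 + 3*3) / 6 = 14/6 ≈ 2.33, so the result is [NaN, NaN, 2.33, 3.33, 4.33]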
Example #43
    def _fit_distributors(
        self,
        original: pd.Series,
        transformed: pd.Series,
    ) -> List[Distributor]:
        distributors = []
        for cat, bin_vals in original.groupby(transformed):
            if len(bin_vals.index) > 0:
                n_unique = bin_vals.nunique()
                if n_unique < self.min_unique_continuous:
                    d = self.discrete_distributor.copy()
                else:
                    d = self.continuous_distributor.copy()
                d.fit(bin_vals.values)
            else:
                # no values in bin, return a mean-producing distributor
                # at the center of the interval
                d = MeanDistributor(seed=self.continuous_distributor.seed)
                d.fit(np.array([cat.left, cat.right]))
            distributors.append(d)
        return distributors
Example #44
    def test_groupby_rolling_group_keys(self, group_keys):
        # GH 37641
        # GH 38523: GH 37641 actually was not a bug.
        # group_keys only applies to groupby.apply directly
        arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
        index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))

        s = Series([1, 2, 3], index=index)
        result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean()
        expected = Series(
            [1.0, 2.0, 3.0],
            index=MultiIndex.from_tuples(
                [
                    ("val1", "val1", "val1", "val1"),
                    ("val1", "val1", "val1", "val1"),
                    ("val2", "val2", "val2", "val2"),
                ],
                names=["idx1", "idx2", "idx1", "idx2"],
            ),
        )
        tm.assert_series_equal(result, expected)
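
A hedged illustration of the comment above, assuming pandas 2.x where GroupBy.apply honors group_keys even for transform-like functions; .rolling(), as the test shows, prepends the group labels regardless of the flag.

import pandas as pd

s = pd.Series([1, 2, 3], index=['x', 'y', 'z'])
s.groupby(['a', 'a', 'b'], group_keys=True).apply(lambda x: x)   # MultiIndex of (group, original label)
s.groupby(['a', 'a', 'b'], group_keys=False).apply(lambda x: x)  # plain original index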
Example #45
    def test_resample_basic(self):
        rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
                         name='index')
        s = Series(np.random.randn(14), index=rng)
        result = s.resample('5min', how='mean', closed='right', label='right')
        expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
                          index=date_range('1/1/2000', periods=4, freq='5min'))
        assert_series_equal(result, expected)
        self.assertEqual(result.index.name, 'index')

        result = s.resample('5min', how='mean', closed='left', label='right')
        expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                          index=date_range('1/1/2000 00:05', periods=3,
                                           freq='5min'))
        assert_series_equal(result, expected)

        s = self.series
        result = s.resample('5Min', how='last')
        grouper = TimeGrouper(Minute(5), closed='left', label='left')
        expect = s.groupby(grouper).agg(lambda x: x[-1])
        assert_series_equal(result, expect)
Example #46
def calculate_cross_section_factor_returns(data: pd.DataFrame, position: pd.Series, price_key='close',
                                           factor_name='cross_sectional_factor') -> pd.DataFrame:
    """

    :param data:
    :param position:
    :param price_key:
    :param factor_name:
    :return:
    """

    # todo returns with different holding period
    # first shift the factor by date, because the factor can only decide future return
    shifted_position = position.groupby(level=1).shift(1)
    rate_of_return = data[price_key].groupby(level=1).pct_change()
    # do multiple elementwise
    factor_returns = pd.DataFrame(shifted_position.values * rate_of_return.values, index=rate_of_return.index)
    # sum up the return in the same date.
    factor_returns = factor_returns.groupby(level=0).sum()
    factor_returns.rename({0: factor_name}, axis=1, inplace=True)
    return factor_returns
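
A hedged usage sketch, assuming both inputs share a (date, asset) MultiIndex as the groupby(level=...) calls above imply; each day's factor return is the sum over assets of the previous day's position times today's per-asset return.

import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range('2021-01-01', periods=3), ['A', 'B']],
    names=['date', 'asset'])
data = pd.DataFrame({'close': [10.0, 20.0, 11.0, 19.0, 12.0, 18.0]}, index=idx)
position = pd.Series([0.5, -0.5] * 3, index=idx)
calculate_cross_section_factor_returns(data, position)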
Example #47
def part2():
    """Time series basics"""
    # 1. The most basic pandas time series: a Series indexed by timestamps
    dates = [datetime(2011,1,2), datetime(2011,1,5), datetime(2011,1,7),
             datetime(2011,1,8), datetime(2011,1,10), datetime(2011,1,12)]
    ts = Series(np.random.rand(6), index=dates)
    # print(ts)
    # print(type(ts))
    # print(ts.index)
    # print(ts.index[0])

    # Indexing, selection, subsetting
    # print(ts[ts.index[2]])  # using the index directly
    # print(ts['1/10/2011'])  # using a string that is interpreted as a date

    longer_ts = Series(np.random.randn(1000),
                       index=pd.date_range('1/1/2000', periods=1000))
    # print(longer_ts)
    # print(longer_ts['2001-05'])  # for a longer series, passing just a year or year-month selects a slice
    # print(ts[datetime(2011, 1, 7)])
    # print(ts['1/6/2011': '1/10/2011'])  # range query

    dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
    long_df = DataFrame(np.random.rand(100, 4),
                        index=dates,
                        columns=['bj', 'hb', 'cd', 'hn'])
    # print(long_df.ix['2001-05'])  # row indexing

    # Time series with duplicate indices
    dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000', '1/3/2000'])
    dup_ts = Series(np.arange(5), index=dates)
    # print(dup_ts)
    # print(dup_ts.index.is_unique)   # the index is not unique
    # print(dup_ts['1/3/2000'])  # unique timestamp, returns a scalar
    # print(dup_ts['1/2/2000'])  # duplicated timestamp, returns a slice

    # goal: aggregate data with non-unique timestamps via groupby, passing level=0 (the only level of the index)
    grouped = dup_ts.groupby(level=0)
    print(grouped.mean())
    print(grouped.count())
Example #48
def test_series_groupby_value_counts_on_categorical():
    # GH38672

    s = Series(Categorical(["a"], categories=["a", "b"]))
    result = s.groupby([0]).value_counts()

    expected = Series(
        data=[1, 0],
        index=MultiIndex.from_arrays([
            [0, 0],
            CategoricalIndex(["a", "b"],
                             categories=["a", "b"],
                             ordered=False,
                             dtype="category"),
        ]),
    )

    # Expected:
    # 0  a    1
    #    b    0
    # dtype: int64

    tm.assert_series_equal(result, expected)
Example #49
    def test_groupby_dict_mapping(self):
        # GH #679
        from pandas import Series
        s = Series({'T1': 5})
        result = s.groupby({'T1': 'T2'}).agg(sum)
        expected = s.groupby(['T2']).agg(sum)
        assert_series_equal(result, expected)

        s = Series([1., 2., 3., 4.], index=list('abcd'))
        mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        assert_series_equal(result, expected)
        assert_series_equal(result, result2)
        assert_series_equal(result, expected2)
Example #50
def stats_items_dist(iter, fn_conv, name, splitter):
    logger.info("Stating %s", name)
    n2s = numberic2SignalFn(int, splitter.range_sum)

    volumes = []
    for i in iter:
        iv = fn_conv(i)
        s = n2s(iv)
        volumes.append(s)

    cs = Series(volumes)
    g = cs.groupby(cs.values).agg(len)
    total = len(volumes)
    sig2prob = {}
    for sig, count in g.iteritems():
        if sig == 0:
            logger.warn("%s sig %s -- %s %s %s", name, sig,
                        count * 1.0 / total, count, total)
            total -= count
        else:
            logger.info("%s sig %s -- %s %s %s", name, sig,
                        count * 1.0 / total, count, total)
            sig2prob[sig] = count * 1.0 / total
    return sig2prob
Example #51
    def test_groupby_dict_mapping(self):
        # GH #679
        from pandas import Series

        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        expected2 = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)
Example #52
def data_density_filter(x,
                        y,
                        conv_matrix=None,
                        min_count=5,
                        return_figures=True):
    """
    Use the 2D density cloud of observations to find outliers for any variables

    The data density filter needs tuning to work well.
    This uses convolution to create the density cloud - you can specify
    the exact convolution matrix, or its shape

    Parameters
    ----------
    x : np.array / pd.Series, shape=[n, ]
        e.g. temperature
    y : np.array / pd.Series, shape=[n, ]
        e.g. salinity
    conv_matrix : int, list, np.array, optional
        int = size of the isotropic round convolution window.
        [int, int] = anisotropic (oval) convolution window.
        2d array is a weighted convolution window, where:
        rectangle = np.ones([int, int]);
        oval = gt.tools.gaussian_kernel([int, int]) > 1e-5
        more advanced anisotropic windows can also be created
    min_count : int, default=5, optional
        masks the 2d histogram counts smaller than this limit when performing
        the convolution
    return_figures : bool, default=True, optional
        returns figures of the data plotted for blob detection...

    Returns
    -------
    mask : np.array, shape=[n, ]
        boolean mask; True where a point is flagged as an outlier
    figure :
        only returned if return_figure is True

    """
    from scipy.signal import convolve2d
    from pandas import Series, cut
    from numpy import linspace, c_, where, inf, array, isnan

    def gaussian_kernel(*shape):
        """
        Returns a 2D array with gaussian values to be used in the blob_outliers
        detection function. Can be anisotropic (oblong). Scaling is determined
        automatically.

        Parameters
        ----------
        shape : int, int
            if one integer is passed the kernel will be isotropic
            if two integers are passed the kernel will be anisotropic

        Returns
        -------
        array  (float)
            The 2D representation of the kernel
        """
        from matplotlib.cbook import flatten
        from numpy import exp, mgrid

        # make shape a list regardless of input
        shape = [int(a // 2) for a in flatten([shape])]
        # anisotropic if len(2) else isotropic
        if len(shape) == 1:
            sx, sy = shape[0], shape[0]
        elif len(shape) == 2:
            sx, sy = shape

        # create the x and y grid
        x, y = mgrid[-sx:sx + 1, -sy:sy + 1]
        sigma = [sx / 8, sy / 8]  # sigma scaled by shape
        c = tuple([sx, sy])  # centre index of x and y
        g = 1 * exp(-((x - x[c])**2 / (2 * sigma[0])**2 + (y - y[c])**2 /
                      (2 * sigma[1])**2))
        return g

    # turning input into pandas.Series
    x = Series(x, name='X' if not isinstance(x, Series) else x.name)
    y = Series(y, name='Y' if not isinstance(y, Series) else y.name)

    ###############
    ##  BINNING  ##
    ###############
    # create bins for the data - equal bins
    xbins = linspace(x.min(), x.max(), 250)
    ybins = linspace(y.min(), y.max(), 250)
    # binning the data with pandas. This is quick to find outliers at the end
    xcut = cut(x, xbins, labels=c_[xbins[:-1], xbins[1:]].mean(1), right=False)
    ycut = cut(y, ybins, labels=c_[ybins[:-1], ybins[1:]].mean(1), right=False)

    # binning the data and returning as a 2D array (pandas.DataFrame)
    count = x.groupby([xcut, ycut]).count()
    count.name = 'count'  # to avoid an error when unstacking
    count = count.unstack()
    count = count.sort_index().sort_index(axis=1)

    ###################
    ##  CONVOLUTION  ##
    ###################
    # make convolution matrix if not given
    if conv_matrix is None:
        conv_matrix = (gaussian_kernel(21) > 1e-5).astype(int)
    elif isinstance(conv_matrix, (list, int, float)):
        conv_matrix = (gaussian_kernel(conv_matrix) > 1e-5).astype(int)
    else:
        ndim = array(conv_matrix).ndim
        if ndim != 2:
            raise UserWarning('conv_matrix must have 2 dimensions')
    # An array with which the convolution is done
    # use a threshold to mask out bins with low counts
    # thus only dense regions of data are considered
    count0 = count.fillna(0).values
    count0[count0 < min_count] = 0
    # 2d convolution with the input matrix
    convolved_count = convolve2d(count0, conv_matrix, mode='same')
    outliers = (convolved_count == 0) & ~isnan(count)

    cols = count.index
    rows = count.columns

    ########################################
    ##  FINDING OUTLIERS AND CREATE MASK  ##
    ########################################
    # find indices of the bins where the convolution is zero but data are
    # present. Then get the x and y values of these points. Turn these into
    # pairs for pandas multi-indexing.
    i, j = where(outliers)
    xi = cols[i].values
    yj = rows[j].values
    ij = list(zip(xi, yj))
    # Create a pandas dataframe with the pd.cut data as indices
    # with a column for a numerical index.
    if len(ij) > 0:
        idx = x.to_frame().reset_index().drop(x.name, axis=1)
        idx = idx.set_axis([xcut, ycut], inplace=False)
        idx = idx.loc[ij]['index'].values
    else:
        idx = None
    # create a placeholder mask (all False) and flag the outliers with True;
    # guard against idx being None, since mask[None] would set every element
    mask = (x > inf).values
    if idx is not None:
        mask[idx] = True

    ###############
    ##  FIGURES  ##
    ###############
    if return_figures:
        from numpy import ma, power, diff, sum, r_
        from matplotlib import colors, cm, pyplot as plt
        # x and y plotting coordinates
        xp = cols.values.astype(float)
        yp = rows.values.astype(float)
        # plotting variables a, b, c
        a = ma.masked_invalid(count.T, 0)
        b = convolved_count.T
        c = ma.masked_where(a.mask, ~outliers.T)

        # create the figure
        fig, ax = plt.subplots(1, 2, figsize=[10, 5], dpi=90, sharey=True)
        # properties for the pcolormesh and contours
        pn = colors.PowerNorm(0.3)
        mesh_props = dict(cmap=cm.Spectral_r, norm=pn)
        # create the pcolormesh plots
        im = (ax[0].pcolormesh(xp, yp, a, vmax=a.max() / 2,
                               **mesh_props), ax[1].pcolormesh(xp,
                                                               yp,
                                                               c,
                                                               vmin=0,
                                                               vmax=1))
        ct = ax[1].contour(xp,
                           yp,
                           b,
                           levels=[0.5],
                           linestyles='-',
                           colors='r',
                           linewidths=2)

        # change figure parameters
        ax[0].set_title('Histogram of data (min_count = {})'.format(min_count))
        ax[1].set_title(
            '{} Outliers found using \n{} convolution with decision boundary'.
            format(mask.sum(), str(conv_matrix.shape)))
        ax[0].set_xticks([])
        ax[0].set_ylabel(y.name)
        ax[1].set_xlabel(x.name)

        # tight layout before creating the axes for pcolomesh plots
        fig.tight_layout()

        # make colorbar axes based on axes [0, 1]
        p = ax[0].get_position()
        cax = fig.add_axes([p.x0, p.y0 - 0.05, p.width, 0.04])
        cb = plt.colorbar(im[0], cax=cax, orientation='horizontal')
        cb.set_label('Count')
        cb.set_ticks([1, 2, 3, 5, 10, 30, 80, 200])
        # plot the min_count on the colorbar
        cx = pn(r_[cb.get_clim(), min_count])[-1]
        cb.ax.plot(cx, 0, marker='^', color='k', markersize=8)
        cb.ax.plot(cx, 1, marker='v', color='k', markersize=8)
        return mask, fig

    return mask
Example #53
    def test_resample_basic(self):
        rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
                         name='index')
        s = Series(np.random.randn(14), index=rng)
        result = s.resample('5min', how='mean', closed='right', label='right')
        expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
                          index=date_range('1/1/2000', periods=4, freq='5min'))
        assert_series_equal(result, expected)
        self.assert_(result.index.name == 'index')

        result = s.resample('5min', how='mean', closed='left', label='right')
        expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()],
                          index=date_range('1/1/2000 00:05', periods=3,
                                           freq='5min'))
        assert_series_equal(result, expected)

        s = self.series
        result = s.resample('5Min', how='last')
        grouper = TimeGrouper(Minute(5), closed='right', label='right')
        expect = s.groupby(grouper).agg(lambda x: x[-1])
        assert_series_equal(result, expect)

        # from daily
        dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10),
                            freq='D', name='index')

        s = Series(np.random.rand(len(dti)), dti)

        # to weekly
        result = s.resample('w-sun', how='last')

        self.assertEquals(len(result), 3)
        self.assert_((result.index.dayofweek == [6,6,6]).all())
        self.assertEquals(result.irow(0), s['1/2/2005'])
        self.assertEquals(result.irow(1), s['1/9/2005'])
        self.assertEquals(result.irow(2), s.irow(-1))

        result = s.resample('W-MON', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [0,0]).all())
        self.assertEquals(result.irow(0), s['1/3/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-TUE', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [1,1]).all())
        self.assertEquals(result.irow(0), s['1/4/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-WED', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [2,2]).all())
        self.assertEquals(result.irow(0), s['1/5/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-THU', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [3,3]).all())
        self.assertEquals(result.irow(0), s['1/6/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        result = s.resample('W-FRI', how='last')
        self.assertEquals(len(result), 2)
        self.assert_((result.index.dayofweek == [4,4]).all())
        self.assertEquals(result.irow(0), s['1/7/2005'])
        self.assertEquals(result.irow(1), s['1/10/2005'])

        # to biz day
        result = s.resample('B', how='last')
        self.assertEquals(len(result), 6)
        self.assert_((result.index.dayofweek == [0,1,2,3,4,0]).all())
        self.assertEquals(result.irow(0), s['1/3/2005'])
        self.assertEquals(result.irow(1), s['1/4/2005'])
        self.assertEquals(result.irow(5), s['1/10/2005'])
        self.assert_(result.index.name == 'index')
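The resample calls above use the how= keyword from older pandas. In current pandas the aggregation is a method on the resampler instead; a minimal sketch of the equivalent spelling (illustrative data, assuming a recent pandas):

import numpy as np
import pandas as pd

rng = pd.date_range('2000-01-01', periods=14, freq='min')
s = pd.Series(np.random.randn(14), index=rng)

# old: s.resample('5min', how='mean', closed='right', label='right')
right_labelled = s.resample('5min', closed='right', label='right').mean()

# old: s.resample('5min', how='last')
last_per_bin = s.resample('5min').last()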
Example #54
0
    def test_ngroup_series_matches_frame(self):
        df = DataFrame({"A": list("aaaba")})
        s = Series(list("aaaba"))

        tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
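For reference, ngroup() assigns each row the number of its group (in sorted group order by default), which is why the frame and series groupbys above agree. A small illustrative sketch:

import pandas as pd

s = pd.Series(list("aaaba"))
# groups in sorted order: 'a' -> 0, 'b' -> 1
s.groupby(s).ngroup().tolist()  # [0, 0, 0, 1, 0]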
Example #55
0
    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]})
        g = df.groupby('A')
        expected = g.sum()

        g = df.groupby(pd.Grouper(key='A'))
        result = g.sum()
        assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key='A', axis=0))
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({
            'A': [0, 0, 0, 1, 1, 1],
            'B': [1, 1, 2, 2, 3, 3],
            'C': [1, 2, 3, 4, 5, 6]
        })
        # Group by single column
        expected = df.groupby('A').sum()
        g = df.groupby([pd.Grouper(key='A')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(['A', 'B']).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(['A', pd.Grouper(key='B')])
        result = g.sum()
        assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key='A'), 'B'])
        result = g.sum()
        assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype='int64'),
            index=pd.MultiIndex.from_product(
                [list('ab'),
                 range(2),
                 date_range('20130101', periods=2)],
                names=['one', 'two', 'three']))
        result = s.groupby(pd.Grouper(level='three', freq='M')).sum()
        expected = Series([28],
                          index=Index([Timestamp('2013-01-31')],
                                      freq='M',
                                      name='three'))
        assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level='one')).sum()
        expected = s.groupby(level='one').sum()
        assert_series_equal(result, expected)
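The essential behavior exercised above is that pd.Grouper(key=...) objects, alone or mixed with plain column names in a list, group exactly like the column names themselves, and that pd.Grouper(level=..., freq=...) resamples a datetime level of a MultiIndex. A condensed sketch of the first equivalence (illustrative data):

import pandas as pd

df = pd.DataFrame({'A': [0, 0, 1, 1], 'B': [1, 2, 3, 4], 'C': [1.0, 2.0, 3.0, 4.0]})

# all three produce the same grouped sum
by_names = df.groupby(['A', 'B']).sum()
by_groupers = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]).sum()
mixed = df.groupby(['A', pd.Grouper(key='B')]).sum()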
Example #56
0
class TestTimeGrouper(tm.TestCase):
    def setUp(self):
        self.ts = Series(np.random.randn(1000),
                         index=date_range('1/1/2000', periods=1000))

    def test_apply(self):
        grouper = TimeGrouper('A', label='right', closed='right')

        grouped = self.ts.groupby(grouper)

        f = lambda x: x.order()[-3:]

        applied = grouped.apply(f)
        expected = self.ts.groupby(lambda x: x.year).apply(f)

        applied.index = applied.index.droplevel(0)
        expected.index = expected.index.droplevel(0)
        assert_series_equal(applied, expected)

    def test_count(self):
        self.ts[::3] = np.nan

        grouper = TimeGrouper('A', label='right', closed='right')
        result = self.ts.resample('A', how='count')

        expected = self.ts.groupby(lambda x: x.year).count()
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_numpy_reduction(self):
        result = self.ts.resample('A', how='prod', closed='right')

        expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
        expected.index = result.index

        assert_series_equal(result, expected)

    def test_apply_iteration(self):
        # #2300
        N = 1000
        ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
        df = DataFrame({'open': 1, 'close': 2}, index=ind)
        tg = TimeGrouper('M')

        _, grouper, _ = tg._get_grouper(df)

        # Errors
        grouped = df.groupby(grouper, group_keys=False)
        f = lambda df: df['close'] / df['open']

        # it works!
        result = grouped.apply(f)
        self.assertTrue(result.index.equals(df.index))

    def test_panel_aggregation(self):
        ind = pd.date_range('1/1/2000', periods=100)
        data = np.random.randn(2, len(ind), 4)
        wp = pd.Panel(data,
                      items=['Item1', 'Item2'],
                      major_axis=ind,
                      minor_axis=['A', 'B', 'C', 'D'])

        tg = TimeGrouper('M', axis=1)
        _, grouper, _ = tg._get_grouper(wp)
        bingrouped = wp.groupby(grouper)
        binagg = bingrouped.mean()

        def f(x):
            assert (isinstance(x, Panel))
            return x.mean(1)

        result = bingrouped.agg(f)
        tm.assert_panel_equal(result, binagg)

    def test_fails_on_no_datetime_index(self):
        index_names = ('Int64Index', 'PeriodIndex', 'Index', 'Float64Index',
                       'MultiIndex')
        index_funcs = (tm.makeIntIndex, tm.makePeriodIndex,
                       tm.makeUnicodeIndex, tm.makeFloatIndex,
                       lambda m: tm.makeCustomIndex(m, 2))
        n = 2
        for name, func in zip(index_names, index_funcs):
            index = func(n)
            df = DataFrame({'a': np.random.randn(n)}, index=index)
            with tm.assertRaisesRegexp(
                    TypeError, "axis must be a DatetimeIndex, "
                    "but got an instance of %r" % name):
                df.groupby(TimeGrouper('D'))
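TimeGrouper and Panel in the class above come from older pandas; TimeGrouper is now spelled pd.Grouper(freq=...) and Panel has been removed. A minimal sketch of the same yearly grouping with the current spelling (assuming a DatetimeIndex):

import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('2000-01-01', periods=1000))

# old: ts.groupby(TimeGrouper('A', label='right', closed='right'))
grouped = ts.groupby(pd.Grouper(freq='A', label='right', closed='right'))
yearly_counts = grouped.count()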
Example #57
0
    def test_named_agg_nametuple(self, inp):
        # GH34422
        s = Series([1, 1, 2, 2, 3, 3, 4, 5])
        msg = f"func is expected but received {type(inp).__name__}"
        with pytest.raises(TypeError, match=msg):
            s.groupby(s.values).agg(a=inp)
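The test above only exercises the error path (a non-callable passed as the keyword value). For context, a minimal sketch of the working named-aggregation spellings in current pandas (illustrative data):

import pandas as pd

s = pd.Series([1, 1, 2, 2, 3, 3, 4, 5])
# SeriesGroupBy: the keyword value is a function or a function name
named = s.groupby(s.values).agg(smallest='min', largest='max')

# DataFrameGroupBy: pd.NamedAgg(column=..., aggfunc=...) selects the column
df = pd.DataFrame({'key': list('aabb'), 'val': [1, 2, 3, 4]})
totals = df.groupby('key').agg(total=pd.NamedAgg(column='val', aggfunc='sum'))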
Example #58
0
    def test_grouper_creation_bug(self):

        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        result = g.apply(lambda x: x.sum())
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame({
            "A": [0, 0, 0, 1, 1, 1],
            "B": [1, 1, 2, 2, 3, 3],
            "C": [1, 2, 3, 4, 5, 6]
        })
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
            index=pd.MultiIndex.from_product(
                [list("ab"),
                 range(2),
                 date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series([28],
                          index=Index([Timestamp("2013-01-31")],
                                      freq="M",
                                      name="three"))
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)
Example #59
0
class TestTimeSeriesDuplicates(object):
    def setup_method(self, method):
        dates = [
            datetime(2000, 1, 2),
            datetime(2000, 1, 2),
            datetime(2000, 1, 2),
            datetime(2000, 1, 3),
            datetime(2000, 1, 3),
            datetime(2000, 1, 3),
            datetime(2000, 1, 4),
            datetime(2000, 1, 4),
            datetime(2000, 1, 4),
            datetime(2000, 1, 5)
        ]

        self.dups = Series(np.random.randn(len(dates)), index=dates)

    def test_constructor(self):
        assert isinstance(self.dups, Series)
        assert isinstance(self.dups.index, DatetimeIndex)

    def test_is_unique_monotonic(self):
        assert not self.dups.index.is_unique

    def test_index_unique(self):
        uniques = self.dups.index.unique()
        expected = DatetimeIndex([
            datetime(2000, 1, 2),
            datetime(2000, 1, 3),
            datetime(2000, 1, 4),
            datetime(2000, 1, 5)
        ])
        assert uniques.dtype == 'M8[ns]'  # sanity
        tm.assert_index_equal(uniques, expected)
        assert self.dups.index.nunique() == 4

        # #2563
        assert isinstance(uniques, DatetimeIndex)

        dups_local = self.dups.index.tz_localize('US/Eastern')
        dups_local.name = 'foo'
        result = dups_local.unique()
        expected = DatetimeIndex(expected, name='foo')
        expected = expected.tz_localize('US/Eastern')
        assert result.tz is not None
        assert result.name == 'foo'
        tm.assert_index_equal(result, expected)

        # NaT, note this is excluded
        arr = [1370745748 + t for t in range(20)] + [tslib.iNaT]
        idx = DatetimeIndex(arr * 3)
        tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
        assert idx.nunique() == 20
        assert idx.nunique(dropna=False) == 21

        arr = [
            Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t)
            for t in range(20)
        ] + [NaT]
        idx = DatetimeIndex(arr * 3)
        tm.assert_index_equal(idx.unique(), DatetimeIndex(arr))
        assert idx.nunique() == 20
        assert idx.nunique(dropna=False) == 21

    def test_index_dupes_contains(self):
        d = datetime(2011, 12, 5, 20, 30)
        ix = DatetimeIndex([d, d])
        assert d in ix

    def test_duplicate_dates_indexing(self):
        ts = self.dups

        uniques = ts.index.unique()
        for date in uniques:
            result = ts[date]

            mask = ts.index == date
            total = (ts.index == date).sum()
            expected = ts[mask]
            if total > 1:
                assert_series_equal(result, expected)
            else:
                assert_almost_equal(result, expected[0])

            cp = ts.copy()
            cp[date] = 0
            expected = Series(np.where(mask, 0, ts), index=ts.index)
            assert_series_equal(cp, expected)

        pytest.raises(KeyError, ts.__getitem__, datetime(2000, 1, 6))

        # new index
        ts[datetime(2000, 1, 6)] = 0
        assert ts[datetime(2000, 1, 6)] == 0

    def test_range_slice(self):
        idx = DatetimeIndex(
            ['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', '1/4/2000'])

        ts = Series(np.random.randn(len(idx)), index=idx)

        result = ts['1/2/2000':]
        expected = ts[1:]
        assert_series_equal(result, expected)

        result = ts['1/2/2000':'1/3/2000']
        expected = ts[1:4]
        assert_series_equal(result, expected)

    def test_groupby_average_dup_values(self):
        result = self.dups.groupby(level=0).mean()
        expected = self.dups.groupby(self.dups.index).mean()
        assert_series_equal(result, expected)

    def test_indexing_over_size_cutoff(self):
        import datetime
        # #1821

        old_cutoff = _index._SIZE_CUTOFF
        try:
            _index._SIZE_CUTOFF = 1000

            # create large list of non periodic datetime
            dates = []
            sec = datetime.timedelta(seconds=1)
            half_sec = datetime.timedelta(microseconds=500000)
            d = datetime.datetime(2011, 12, 5, 20, 30)
            n = 1100
            for i in range(n):
                dates.append(d)
                dates.append(d + sec)
                dates.append(d + sec + half_sec)
                dates.append(d + sec + sec + half_sec)
                d += 3 * sec

            # duplicate some values in the list
            duplicate_positions = np.random.randint(0, len(dates) - 1, 20)
            for p in duplicate_positions:
                dates[p + 1] = dates[p]

            df = DataFrame(np.random.randn(len(dates), 4),
                           index=dates,
                           columns=list('ABCD'))

            pos = n * 3
            timestamp = df.index[pos]
            assert timestamp in df.index

            # it works!
            df.loc[timestamp]
            assert len(df.loc[[timestamp]]) > 0
        finally:
            _index._SIZE_CUTOFF = old_cutoff

    def test_indexing_unordered(self):
        # GH 2437
        rng = date_range(start='2011-01-01', end='2011-01-15')
        ts = Series(np.random.rand(len(rng)), index=rng)
        ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]])

        for t in ts.index:
            # TODO: unused?
            s = str(t)  # noqa

            expected = ts[t]
            result = ts2[t]
            assert expected == result

        # GH 3448 (ranges)
        def compare(slobj):
            result = ts2[slobj].copy()
            result = result.sort_index()
            expected = ts[slobj]
            assert_series_equal(result, expected)

        compare(slice('2011-01-01', '2011-01-15'))
        compare(slice('2010-12-30', '2011-01-15'))
        compare(slice('2011-01-01', '2011-01-16'))

        # partial ranges
        compare(slice('2011-01-01', '2011-01-6'))
        compare(slice('2011-01-06', '2011-01-8'))
        compare(slice('2011-01-06', '2011-01-12'))

        # single values
        result = ts2['2011'].sort_index()
        expected = ts['2011']
        assert_series_equal(result, expected)

        # diff freq
        rng = date_range(datetime(2005, 1, 1), periods=20, freq='M')
        ts = Series(np.arange(len(rng)), index=rng)
        ts = ts.take(np.random.permutation(20))

        result = ts['2005']
        for t in result.index:
            assert t.year == 2005

    def test_indexing(self):

        idx = date_range("2001-1-1", periods=20, freq='M')
        ts = Series(np.random.rand(len(idx)), index=idx)

        # getting

        # GH 3070, make sure semantics work on Series/Frame
        expected = ts['2001']
        expected.name = 'A'

        df = DataFrame(dict(A=ts))
        result = df['2001']['A']
        assert_series_equal(expected, result)

        # setting
        ts['2001'] = 1
        expected = ts['2001']
        expected.name = 'A'

        df.loc['2001', 'A'] = 1

        result = df['2001']['A']
        assert_series_equal(expected, result)

        # GH3546 (not including times on the last day)
        idx = date_range(start='2013-05-31 00:00',
                         end='2013-05-31 23:00',
                         freq='H')
        ts = Series(lrange(len(idx)), index=idx)
        expected = ts['2013-05']
        assert_series_equal(expected, ts)

        idx = date_range(start='2013-05-31 00:00',
                         end='2013-05-31 23:59',
                         freq='S')
        ts = Series(lrange(len(idx)), index=idx)
        expected = ts['2013-05']
        assert_series_equal(expected, ts)

        idx = [
            Timestamp('2013-05-31 00:00'),
            Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))
        ]
        ts = Series(lrange(len(idx)), index=idx)
        expected = ts['2013']
        assert_series_equal(expected, ts)

        # GH14826, indexing with a seconds resolution string / datetime object
        df = DataFrame(np.random.rand(5, 5),
                       columns=['open', 'high', 'low', 'close', 'volume'],
                       index=date_range('2012-01-02 18:01:00',
                                        periods=5,
                                        tz='US/Central',
                                        freq='s'))
        expected = df.loc[[df.index[2]]]

        # this is a single date, so will raise
        pytest.raises(
            KeyError,
            df.__getitem__,
            '2012-01-02 18:01:02',
        )
        pytest.raises(
            KeyError,
            df.__getitem__,
            df.index[2],
        )
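Most of the class above exercises indexing with duplicate timestamps and partial-string selection. The essential behaviors: a duplicated label returns a Series while a unique label returns a scalar, and a year/month string selects the whole period. A condensed sketch (illustrative data, using .loc for explicitness):

import numpy as np
import pandas as pd
from datetime import datetime

dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 5)]
dups = pd.Series(np.random.randn(3), index=dates)

dup_rows = dups.loc[datetime(2000, 1, 2)]   # duplicated label -> Series of length 2
single = dups.loc[datetime(2000, 1, 5)]     # unique label -> scalar

rng = pd.date_range('2011-01-01', '2011-03-31')
ts = pd.Series(np.arange(len(rng)), index=rng)
january = ts.loc['2011-01']                 # partial-string selection of the month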
Example #60
0
    def test_groupby_rolling_center_center(self):
        # GH 35552
        series = Series(range(1, 6))
        result = series.groupby(series).rolling(center=True, window=3).mean()
        expected = Series(
            [np.nan] * 5,
            index=pd.MultiIndex.from_tuples(
                ((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))),
        )
        tm.assert_series_equal(result, expected)

        series = Series(range(1, 5))
        result = series.groupby(series).rolling(center=True, window=3).mean()
        expected = Series(
            [np.nan] * 4,
            index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))),
        )
        tm.assert_series_equal(result, expected)

        df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)})
        result = df.groupby("a").rolling(center=True, window=3).mean()
        expected = pd.DataFrame(
            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan],
            index=pd.MultiIndex.from_tuples(
                (
                    ("a", 0),
                    ("a", 1),
                    ("a", 2),
                    ("a", 3),
                    ("a", 4),
                    ("b", 5),
                    ("b", 6),
                    ("b", 7),
                    ("b", 8),
                    ("b", 9),
                    ("b", 10),
                ),
                names=["a", None],
            ),
            columns=["b"],
        )
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)})
        result = df.groupby("a").rolling(center=True, window=3).mean()
        expected = pd.DataFrame(
            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan],
            index=pd.MultiIndex.from_tuples(
                (
                    ("a", 0),
                    ("a", 1),
                    ("a", 2),
                    ("a", 3),
                    ("a", 4),
                    ("b", 5),
                    ("b", 6),
                    ("b", 7),
                    ("b", 8),
                    ("b", 9),
                ),
                names=["a", None],
            ),
            columns=["b"],
        )
        tm.assert_frame_equal(result, expected)
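As the expectations above show, a centered groupby-rolling mean leaves NaN at both edges of each group and returns a MultiIndex of (group key, original row position). A minimal usage sketch:

import pandas as pd

df = pd.DataFrame({'a': ['x'] * 5 + ['y'] * 4, 'b': range(9)})
rolled = df.groupby('a').rolling(center=True, window=3).mean()
# rolled['b'] is NaN on the first and last row of each group, because the
# centered window of length 3 is incomplete there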