Code example #1
File: test_nth.py  Project: brianholland/pandas
def test_group_selection_cache():
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    expected = df.iloc[[0, 2]].set_index('A')

    g = df.groupby('A')
    result1 = g.head(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.head(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)
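
Editor's note: GH 12839 was a group-selection caching bug where calling head/tail and nth on the same GroupBy object in different orders gave inconsistent results. A minimal, self-contained sketch of the operations involved (assuming a recent pandas; since pandas 2.0, nth keeps the original row index rather than indexing by group key):

import pandas as pd

df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
print(g.head(2))  # up to two leading rows per group, original index kept
print(g.nth(0))   # first row of each group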
Code example #2
File: test_functional.py  Project: MasonGallo/pandas
    def test_nunique(self):
        df = DataFrame({
            'A': list('abbacc'),
            'B': list('abxacc'),
            'C': list('abbacx'),
        })

        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
        result = df.groupby('A', as_index=False).nunique()
        tm.assert_frame_equal(result, expected)

        # as_index
        expected.index = list('abc')
        expected.index.name = 'A'
        result = df.groupby('A').nunique()
        tm.assert_frame_equal(result, expected)

        # with na
        result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
        tm.assert_frame_equal(result, expected)

        # dropna
        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
                             index=list('abc'))
        expected.index.name = 'A'
        result = df.replace({'x': None}).groupby('A').nunique()
        tm.assert_frame_equal(result, expected)
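
For quick reference, a standalone sketch of the API exercised above: DataFrameGroupBy.nunique counts distinct values per column within each group, and dropna controls whether missing values count as their own value.

import pandas as pd

df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1, None, 1]})
print(df.groupby('A')['B'].nunique())              # x -> 1, y -> 1
print(df.groupby('A')['B'].nunique(dropna=False))  # x -> 2, y -> 1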
Code example #3
File: test_timegrouper.py  Project: sinhrks/pandas
    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        periods = 1000
        ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)
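
A compact sketch of the two behaviors checked here: .groups maps each group key (a Timestamp) to the row labels in that group, and get_group retrieves one group's rows.

import pandas as pd

idx = pd.date_range('2015-01-01', periods=3, name='date')
df = pd.DataFrame({'A': [1, 2, 3]}, index=idx)
g = df.groupby(level='date')
print(list(g.groups)[:2])                       # keys are Timestamps
print(g.get_group(pd.Timestamp('2015-01-02')))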
Code example #4
File: test_pivot.py  Project: wabu/pandas
    def test_crosstab_margins(self):
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({'a': a, 'b': b, 'c': c})

        result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
                          margins=True)

        self.assertEqual(result.index.names, ('a',))
        self.assertEqual(result.columns.names, ['b', 'c'])

        all_cols = result['All', '']
        exp_cols = df.groupby(['a']).size().astype('i8')
        exp_cols = exp_cols.append(Series([len(df)], index=['All']))

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.loc['All']
        exp_rows = df.groupby(['b', 'c']).size().astype('i8')
        exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))

        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)
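
For context, a self-contained sketch of pd.crosstab with margins=True, which appends an 'All' row and column containing the subtotals that this test reconstructs via groupby().size():

import numpy as np
import pandas as pd

a = np.array(['x', 'x', 'y'])
b = np.array(['u', 'v', 'u'])
print(pd.crosstab(a, b, rownames=['a'], colnames=['b'], margins=True))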
Code example #5
File: test_datetime_index.py  Project: Itay4/pandas
def test_resample_timegrouper():
    # GH 7227
    dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
              datetime(2014, 11, 5), datetime(2014, 9, 5),
              datetime(2014, 10, 8), datetime(2014, 7, 15)]

    dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
    dates3 = [pd.NaT] + dates1 + [pd.NaT]

    for dates in [dates1, dates2, dates3]:
        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31',
                                    '2014-09-30',
                                    '2014-10-31', '2014-11-30'],
                                   freq='M', name='A')
        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)

        df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(
            len(dates))))
        result = df.set_index('A').resample('M').count()
        expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
                             index=exp_idx, columns=['B', 'C'])
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)
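
The equivalence under test, in isolation: resampling a DatetimeIndex and grouping a datetime column with pd.Grouper give the same result. A sketch (note the 'M' month-end alias is spelled 'ME' from pandas 2.2 on):

import pandas as pd

df = pd.DataFrame({'A': pd.to_datetime(['2014-09-03', '2014-10-01']),
                   'B': [1, 2]})
r1 = df.set_index('A').resample('M').count()
r2 = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert r1.equals(r2)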
Code example #6
File: groupby.py  Project: Itay4/pandas
class Size(object):

    def setup(self):
        n = 10**5
        offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
        dates = np.datetime64('now') + offsets
        self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                             'key2': np.random.randint(0, 100, size=n),
                             'value1': np.random.randn(n),
                             'value2': np.random.randn(n),
                             'value3': np.random.randn(n),
                             'dates': dates})
        self.draws = Series(np.random.randn(n))
        labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
        self.cats = labels.astype('category')

    def time_multi_size(self):
        self.df.groupby(['key1', 'key2']).size()

    def time_dt_timegrouper_size(self):
        with warnings.catch_warnings(record=True):
            self.df.groupby(TimeGrouper(key='dates', freq='M')).size()

    def time_category_size(self):
        self.draws.groupby(self.cats).size()
Code example #7
def test_preserve_categorical_dtype():
    # GH13743, GH13854
    df = DataFrame({'A': [1, 2, 1, 1, 2],
                    'B': [10, 16, 22, 28, 34],
                    'C1': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=False),
                    'C2': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=True)})
    # single grouper
    exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
                          'B': [25.0, 20.0, np.nan],
                          'C1': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=False),
                          'C2': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=True)})
    for col in ['C1', 'C2']:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean()
        result2 = df.groupby(
            by=col, as_index=True, observed=False).mean().reset_index()
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)
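
The behavior being preserved: with observed=False, grouping by a Categorical keeps unobserved categories in the result (as NaN rows), and the result's category order follows the dtype. A minimal sketch:

import pandas as pd

df = pd.DataFrame({'C': pd.Categorical(['a', 'b'], categories=['b', 'a', 'c']),
                   'v': [1, 2]})
# index comes out in category order ('b', 'a', 'c'); unobserved 'c' is NaN
print(df.groupby('C', observed=False)['v'].mean())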
Code example #8
File: test_time_grouper.py  Project: Itay4/pandas
def test_aggregate_normal(resample_method):
    """Check TimeGrouper's aggregation is identical as normal groupby."""

    if resample_method == 'ohlc':
        pytest.xfail(reason='DataError: No numeric types to aggregate')

    data = np.random.randn(20, 4)
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, 3, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
                    datetime(2013, 1, 3), datetime(2013, 1, 4),
                    datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    expected = getattr(normal_grouped, resample_method)()
    dt_result = getattr(dt_grouped, resample_method)()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    tm.assert_equal(expected, dt_result)

    # if a TimeGrouper is used, 'nth' doesn't work yet
Code example #9
def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
            ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
            ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
            ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
            ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]

    df = DataFrame(data, columns=['date', 'id', 'score'])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index('date').resample('D').asfreq()
    expected = df.groupby('id').apply(f)
    result = df.set_index('date').groupby('id').resample('D').asfreq()
    assert_frame_equal(result, expected)

    df = DataFrame({'date': pd.date_range(start='2016-01-01',
                                          periods=4,
                                          freq='W'),
                    'group': [1, 1, 2, 2],
                    'val': [5, 6, 7, 8]}).set_index('date')

    def f(x):
        return x.resample('1D').ffill()
    expected = df.groupby('group').apply(f)
    result = df.groupby('group').resample('1D').ffill()
    assert_frame_equal(result, expected)
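
The deferred pattern in isolation: df.groupby(...).resample(...) builds a per-group resampler whose operations (asfreq, ffill, ...) are applied group by group. A sketch using a single column to stay version-robust:

import pandas as pd

idx = pd.date_range('2016-01-03', periods=4, freq='W')
df = pd.DataFrame({'group': [1, 1, 2, 2], 'val': [5, 6, 7, 8]}, index=idx)
result = df.groupby('group')['val'].resample('D').ffill()
print(result.head())  # MultiIndex of (group, date)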
Code example #10
def test_preserve_categories():
    # GH-13179
    categories = list('abc')

    # ordered=True
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=True)})
    index = pd.CategoricalIndex(categories, categories, ordered=True)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index, index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index, index)

    # ordered=False
    df = DataFrame({'A': pd.Categorical(list('ba'),
                                        categories=categories,
                                        ordered=False)})
    sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
    nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
                                       ordered=False)
    tm.assert_index_equal(
        df.groupby('A', sort=True, observed=False).first().index,
        sort_index)
    tm.assert_index_equal(
        df.groupby('A', sort=False, observed=False).first().index,
        nosort_index)
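
In short: with sort=True the result index follows the declared category order, while sort=False follows order of appearance (at least in the pandas versions this test targets). A sketch:

import pandas as pd

cat = pd.Categorical(list('ba'), categories=list('abc'), ordered=False)
df = pd.DataFrame({'A': cat, 'v': [1, 2]})
print(df.groupby('A', sort=True, observed=False).first().index)   # a, b, c
print(df.groupby('A', sort=False, observed=False).first().index)  # b, a, c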
Code example #11
File: test_aggregate.py  Project: tsdlovell/pandas
    def test_cython_agg_nothing_to_agg_with_dates(self):
        frame = DataFrame({'a': np.random.randint(0, 5, 50),
                           'b': ['foo', 'bar'] * 25,
                           'dates': pd.date_range('now', periods=50,
                                                  freq='T')})
        with tm.assertRaisesRegexp(DataError, "No numeric types to aggregate"):
            frame.groupby('b').dates.mean()
Code example #12
File: test_rank.py  Project: BranYang/pandas
def test_rank_object_raises(ties_method, ascending, na_option,
                            pct, vals):
    df = DataFrame({'key': ['foo'] * 5, 'val': vals})
    with tm.assert_raises_regex(TypeError, "not callable"):
        df.groupby('key').rank(method=ties_method,
                               ascending=ascending,
                               na_option=na_option, pct=pct)
Code example #13
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]
    
    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
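
A hypothetical usage sketch for this helper (the dataset and column names below are made up for illustration):

logs = [{'endpoint': '/a', 'latency': 120},
        {'endpoint': '/b', 'latency': 80},
        {'endpoint': '/a', 'latency': 100}]
agg = LogAggregate(logs)
print(agg.get_median(key='latency', group_by='endpoint'))   # per-endpoint median
print(agg.get_average(key='latency', group_by='endpoint'))  # per-endpoint mean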
Code example #14
File: test_other.py  Project: Michael-E-Rose/pandas
def test_agg_datetimes_mixed():
    data = [[1, '2012-01-01', 1.0],
            [2, '2012-01-02', 2.0],
            [3, None, 3.0]]

    df1 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    data = [[row[0],
             (dt.datetime.strptime(row[1], '%Y-%m-%d').date()
              if row[1] else None),
             row[2]]
            for row in data]

    df2 = DataFrame({'key': [x[0] for x in data],
                     'date': [x[1] for x in data],
                     'value': [x[2] for x in data]})

    df1['weights'] = df1['value'] / df1['value'].sum()
    gb1 = df1.groupby('date').aggregate(np.sum)

    df2['weights'] = df2['value'] / df2['value'].sum()
    gb2 = df2.groupby('date').aggregate(np.sum)

    assert len(gb1) == len(gb2)
Code example #15
File: test_grouping.py  Project: bashtage/pandas
    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432, adapted for GH25871
        columns = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.array([[1, 2, 1, 2],
                         [1, 2, 1, 2],
                         [1, 2, 1, 2],
                         [1, 2, 1, 2],
                         [1, 2, 1, 2]], int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2],
                                  [4, 2],
                                  [4, 2],
                                  [4, 2],
                                  [4, 2]], int)
        expected_columns = CategoricalIndex(categories,
                                            categories=categories,
                                            ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)
Code example #16
File: test_function.py  Project: TomAugspurger/pandas
def test_ops_general():
    ops = [('mean', np.mean),
           ('median', np.median),
           ('std', np.std),
           ('var', np.var),
           ('sum', np.sum),
           ('prod', np.prod),
           ('min', np.min),
           ('max', np.max),
           ('first', lambda x: x.iloc[0]),
           ('last', lambda x: x.iloc[-1]),
           ('count', np.size), ]
    try:
        from scipy.stats import sem
    except ImportError:
        pass
    else:
        ops.append(('sem', sem))
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    for op, targop in ops:
        result = getattr(df.groupby(labels), op)().astype(float)
        expected = df.groupby(labels).agg(targop)
        try:
            tm.assert_frame_equal(result, expected)
        except BaseException as exc:
            exc.args += ('operation: %s' % op, )
            raise
Code example #17
File: test_grouping.py  Project: Itay4/pandas
    def test_multi_iter_frame(self, three_group):
        k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        k2 = np.array(['1', '2', '1', '2', '1', '2'])
        df = DataFrame({'v1': np.random.randn(6),
                        'v2': np.random.randn(6),
                        'k1': k1, 'k2': k2},
                       index=['one', 'two', 'three', 'four', 'five', 'six'])

        grouped = df.groupby(['k1', 'k2'])

        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        expected = [('a', '1', df.loc[idx[[4]]]),
                    ('a', '2', df.loc[idx[[3, 5]]]),
                    ('b', '1', df.loc[idx[[0, 2]]]),
                    ('b', '2', df.loc[idx[[1]]])]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            assert_frame_equal(three, e3)

        # don't iterate through groups with no data
        df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a'])
        df['k2'] = np.array(['1', '1', '1', '2', '2', '2'])
        grouped = df.groupby(['k1', 'k2'])
        groups = {key: gp for key, gp in grouped}
        assert len(groups) == 2

        # axis = 1
        three_levels = three_group.groupby(['A', 'B', 'C']).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for key, group in grouped:
            pass
Code example #18
File: test_time_grouper.py  Project: Itay4/pandas
def test_aggregate_with_nat(func, fill_value):
    # check TimeGrouper's aggregation is identical to a normal groupby
    # if NaT is included, 'var', 'std', 'mean', 'first', 'last'
    # and 'nth' don't work yet

    n = 20
    data = np.random.randn(n, 4).astype('int64')
    normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    normal_df['key'] = [1, 2, np.nan, 4, 5] * 4

    dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
    dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
                    datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4

    normal_grouped = normal_df.groupby('key')
    dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))

    normal_result = getattr(normal_grouped, func)()
    dt_result = getattr(dt_grouped, func)()

    pad = DataFrame([[fill_value] * 4], index=[3],
                    columns=['A', 'B', 'C', 'D'])
    expected = normal_result.append(pad)
    expected = expected.sort_index()
    expected.index = date_range(start='2013-01-01', freq='D',
                                periods=5, name='key')
    assert_frame_equal(expected, dt_result)
    assert dt_result.index.name == 'key'
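
Background for the padding above: a plain groupby drops rows whose key is NaN/NaT (with the default dropna=True), while time-based grouping still emits the empty bin, so the expected frame has to be padded by hand. A sketch of the key-dropping behavior:

import numpy as np
import pandas as pd

df = pd.DataFrame({'key': [1.0, np.nan, 2.0], 'v': [10, 20, 30]})
print(df.groupby('key')['v'].sum())  # only keys 1.0 and 2.0 appear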
Code example #19
File: test_cython.py  Project: DusanMilunovic/pandas
def test__cython_agg_general(op, targop):
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = df.groupby(labels)._cython_agg_general(op)
    expected = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)
Code example #20
File: test_rank.py  Project: christlc/pandas
def test_rank_apply():
    lev1 = tm.rands_array(10, 100)
    lev2 = tm.rands_array(10, 130)
    lab1 = np.random.randint(0, 100, size=500)
    lab2 = np.random.randint(0, 130, size=500)

    df = DataFrame({'value': np.random.randn(500),
                    'key1': lev1.take(lab1),
                    'key2': lev2.take(lab2)})

    result = df.groupby(['key1', 'key2']).value.rank()

    expected = []
    for key, piece in df.groupby(['key1', 'key2']):
        expected.append(piece.value.rank())
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)

    result = df.groupby(['key1', 'key2']).value.rank(pct=True)

    expected = []
    for key, piece in df.groupby(['key1', 'key2']):
        expected.append(piece.value.rank(pct=True))
    expected = concat(expected, axis=0)
    expected = expected.reindex(result.index)
    tm.assert_series_equal(result, expected)
Code example #21
File: test_cython.py  Project: DusanMilunovic/pandas
def test_cython_agg_nothing_to_agg_with_dates():
    frame = DataFrame({'a': np.random.randint(0, 5, 50),
                       'b': ['foo', 'bar'] * 25,
                       'dates': pd.date_range('now', periods=50, freq='T')})
    msg = "No numeric types to aggregate"
    with pytest.raises(DataError, match=msg):
        frame.groupby('b').dates.mean()
Code example #22
File: groupby.py  Project: Itay4/pandas
    def setup(self, dtype, method, application):
        if method in method_blacklist.get(dtype, {}):
            raise NotImplementedError  # skip benchmark
        ngroups = 1000
        size = ngroups * 2
        rng = np.arange(ngroups)
        values = rng.take(np.random.randint(0, ngroups, size=size))
        if dtype == 'int':
            key = np.random.randint(0, size, size=size)
        elif dtype == 'float':
            key = np.concatenate([np.random.random(ngroups) * 0.1,
                                  np.random.random(ngroups) * 10.0])
        elif dtype == 'object':
            key = ['foo'] * size
        elif dtype == 'datetime':
            key = date_range('1/1/2011', periods=size, freq='s')

        df = DataFrame({'values': values, 'key': key})

        if application == 'transform':
            if method == 'describe':
                raise NotImplementedError

            self.as_group_method = lambda: df.groupby(
                'key')['values'].transform(method)
            self.as_field_method = lambda: df.groupby(
                'values')['key'].transform(method)
        else:
            self.as_group_method = getattr(df.groupby('key')['values'], method)
            self.as_field_method = getattr(df.groupby('values')['key'], method)
Code example #23
File: test_pivot.py  Project: ChristopherShort/pandas
    def test_crosstab_margins(self):
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)

        self.assertEqual(result.index.names, ("a",))
        self.assertEqual(result.columns.names, ["b", "c"])

        all_cols = result["All", ""]
        exp_cols = df.groupby(["a"]).size().astype("i8")
        exp_cols = exp_cols.append(Series([len(df)], index=["All"]))
        exp_cols.name = ("All", "")

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.ix["All"]
        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
        exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")]))
        exp_rows.name = "All"

        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)
Code example #24
File: test_timegrouper.py  Project: sinhrks/pandas
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        expected = (
            df.groupby('user_id')['whole_cost']
              .resample(freq)
              .sum(min_count=1)  # XXX
              .dropna()
              .reorder_levels(['date', 'user_id'])
              .sort_index()
              .astype('int64')
        )
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)
Code example #25
File: test_multilevel.py  Project: smc77/pandas
    def test_groupby_corner(self):
        midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']],
                          labels=[[0], [0], [0]],
                          names=['one', 'two', 'three'])
        df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'],
                       index=midx)
        # should work
        df.groupby(level='three')
Code example #26
File: test_grouping.py  Project: TomAugspurger/pandas
    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432
        columns = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.ones((5, 4), int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = 2 * np.ones((5, 2), int)

        if observed:
            # with observed=False the result is reindexed over the full
            # category set (in category order), so the expected columns
            # differ between the two cases
            expected_columns = CategoricalIndex(['A', 'B'],
                                                categories=categories,
                                                ordered=True)
        else:
            expected_columns = CategoricalIndex(categories,
                                                categories=categories,
                                                ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)
Code example #27
File: test_apply.py  Project: bashtage/pandas
def test_groupby_as_index_apply(df):
    # GH #4648 and #3417
    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'user_id': [1, 2, 1, 1, 3, 1],
                    'time': range(6)})

    g_as = df.groupby('user_id', as_index=True)
    g_not_as = df.groupby('user_id', as_index=False)

    res_as = g_as.head(2).index
    res_not_as = g_not_as.head(2).index
    exp = Index([0, 1, 2, 4])
    tm.assert_index_equal(res_as, exp)
    tm.assert_index_equal(res_not_as, exp)

    res_as_apply = g_as.apply(lambda x: x.head(2)).index
    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2),
                                               (1, 1), (2, 4)])
    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
    exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])

    tm.assert_index_equal(res_as_apply, exp_as_apply)
    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

    ind = Index(list('abcde'))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    res = df.groupby(0, as_index=False).apply(lambda x: x).index
    tm.assert_index_equal(res, ind)
Code example #28
File: test_functional.py  Project: MasonGallo/pandas
    def test_size(self):
        grouped = self.df.groupby(['A', 'B'])
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        grouped = self.df.groupby('A')
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        grouped = self.df.groupby('B')
        result = grouped.size()
        for key, group in grouped:
            assert result[key] == len(group)

        df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
        for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
            left = df.groupby(key, sort=sort).size()
            right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
            assert_series_equal(left, right, check_names=False)

        # GH11699
        df = DataFrame([], columns=['A', 'B'])
        out = Series([], dtype='int64', index=Index([], name='A'))
        assert_series_equal(df.groupby('A').size(), out)
Code example #29
File: test_aggregate.py  Project: cpcloud/pandas
    def test_cython_agg_boolean(self):
        frame = DataFrame({'a': np.random.randint(0, 5, 50),
                           'b': np.random.randint(0, 2, 50).astype('bool')})
        result = frame.groupby('a')['b'].mean()
        expected = frame.groupby('a')['b'].agg(np.mean)

        assert_series_equal(result, expected)
Code example #30
File: test_timegrouper.py  Project: sinhrks/pandas
    def test_groupby_max_datetime64(self):
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
        expected = df.groupby('A')['A'].apply(lambda x: x.max())
        result = df.groupby('A')['A'].max()
        assert_series_equal(result, expected)
Code example #31
    def test_multiindex_columns_empty_level(self):
        lst = [['count', 'values'], ['to filter', '']]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, 'A']], columns=midx)

        grouped = df.groupby('to filter').groups
        assert grouped['A'] == [0]

        grouped = df.groupby([('to filter', '')]).groups
        assert grouped['A'] == [0]

        df = DataFrame([[1, 'A'], [2, 'B']], columns=midx)

        expected = df.groupby('to filter').groups
        result = df.groupby([('to filter', '')]).groups
        assert result == expected

        df = DataFrame([[1, 'A'], [2, 'A']], columns=midx)

        expected = df.groupby('to filter').groups
        result = df.groupby([('to filter', '')]).groups
        tm.assert_dict_equal(result, expected)
Code example #32
    def test_multiindex_columns_empty_level(self):
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)
Code example #33
class Nth:

    param_names = ["dtype"]
    params = ["float32", "float64", "datetime", "object"]

    def setup(self, dtype):
        N = 10**5
        # with datetimes (GH7555)
        if dtype == "datetime":
            values = date_range("1/1/2011", periods=N, freq="s")
        elif dtype == "object":
            values = ["foo"] * N
        else:
            values = np.arange(N).astype(dtype)

        key = np.arange(N)
        self.df = DataFrame({"key": key, "values": values})
        self.df.iloc[1, 1] = np.nan  # insert missing data

    def time_frame_nth_any(self, dtype):
        self.df.groupby("key").nth(0, dropna="any")

    def time_groupby_nth_all(self, dtype):
        self.df.groupby("key").nth(0, dropna="all")

    def time_frame_nth(self, dtype):
        self.df.groupby("key").nth(0)

    def time_series_nth_any(self, dtype):
        self.df["values"].groupby(self.df["key"]).nth(0, dropna="any")

    def time_series_nth_all(self, dtype):
        self.df["values"].groupby(self.df["key"]).nth(0, dropna="all")

    def time_series_nth(self, dtype):
        self.df["values"].groupby(self.df["key"]).nth(0)
Code example #34
    def test_mangled(self):
        df = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})
        result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1))
        expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=Index([0, 1], name="A"))
        tm.assert_frame_equal(result, expected)
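
This is the named-aggregation syntax added in pandas 0.25: each keyword argument maps an output column to a (source column, aggregation) pair. A sketch:

import pandas as pd

df = pd.DataFrame({'A': [0, 0, 1], 'B': [1, 2, 3]})
print(df.groupby('A').agg(b_min=('B', 'min'), b_max=('B', 'max')))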
Code example #35
class CocoModel(object):
    def __init__(self, path):
        super(CocoModel, self).__init__()
        self.path = path
        self.coco = COCO(self.path)
        self.annotations = self.coco.loadAnns(self.coco.getAnnIds())
        self.df = DataFrame(self.annotations)
        self.imgs_ids = None
        self.catgs_ids = None
        self.catgs_names = None
        self.cooc_matrix = None

    def images_ids(self):
        if not self.imgs_ids:
            self.imgs_ids = list(self.df['image_id'].unique())
        return self.imgs_ids

    def categories_ids(self):
        if not self.catgs_ids:
            self.catgs_ids = list(self.df['category_id'].unique())
        return self.catgs_ids

    def categories_names(self):
        if not self.catgs_names:
            catgs = self.coco.loadCats(self.categories_ids())
            self.catgs_names = [c['name'].replace(' ', '_') for c in catgs]
        return self.catgs_names

    def max_objects_per_image(self):
        return self.df.groupby('image_id').size().max()

    def get_image_categories(self, img_id):
        return list(self.df[self.df.image_id == img_id]['category_id'])

    def get_category_name(self, catg_id):
        return self.categories_names()[self.categories_ids().index(catg_id)]

    def get_category_id_by_name(self, catg_name):
        if catg_name == "nop":
            return -1
        return self.categories_ids()[self.categories_names().index(catg_name)]

    def cooccurrence_matrix(self):
        if self.cooc_matrix is None:
            self.cooc_matrix = np.zeros(
                (len(self.categories_ids()), len(self.categories_ids())),
                dtype=np.int32)
            for img in self.images_ids():
                catgs_img = self.get_image_categories(img)
                for i, c_id in enumerate(catgs_img):
                    i_id = self.categories_ids().index(c_id)
                    for j in range(i + 1, len(catgs_img)):
                        j_id = self.categories_ids().index(catgs_img[j])
                        self.cooc_matrix[i_id, j_id] += 1
            np.fill_diagonal(self.cooc_matrix, 0)
        return self.cooc_matrix

    def topn_coocurrences(self, catg_id, n=10):
        idx = self.categories_ids().index(catg_id)
        # Get the catg_id equivalent cooc_matrix row
        # Get the indices that would sort this row
        # Reverse the indices array to get coocurrences in descending order
        # Get only the n most coocurring categories indices
        # Map the indices to the categories id and return them
        sort_row = np.argsort(self.cooccurrence_matrix()[idx, :])[::-1][:n]
        return [self.categories_ids()[i] for i in sort_row]
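
A hypothetical usage sketch (requires pycocotools and a COCO-format annotation file; the path below is made up):

model = CocoModel('annotations/instances_val2017.json')
print(model.max_objects_per_image())            # most annotations in one image
print(model.topn_coocurrences(catg_id=1, n=5))  # ids most co-occurring with 1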
Code example #36
    def analysedata():
        # read the dictionary data from the pickle file generated above
        pkl_file = open('analyse.pkl', 'rb')
        dic_dystany = pickle.load(pkl_file)
        pkl_file.close()

        # handle stop words
        stp_file = codecs.open('gswstpwrds.txt', 'r', 'utf8')
        stop_words = stp_file.read()
        stp_file.close()
        stop_words = stop_words.split('\n')
        stop_words = stop_words + [
            u'\r\n',
            u'...',
            u'\r\r',
            u'so.gushiwen.org',
            u'佚名\r',
        ]
        stop_words = Series(stop_words)

        # dy_fanyi_list = [dic_dystany[dy]['content_fanyi_merge'] for dy in dic_dystany.keys()]
        dy_list = [
            u'先秦',
            u'两汉',
            u'魏晋',
            u'南北朝',
            u'隋代',
            u'唐代',
            u'五代',
            u'宋代',
            u'金朝',
            u'元代',
            u'明代',
            u'清代',
        ]
        # Pre-Qin, Tang, Song, Qing and Yuan make up about four fifths of all
        # the content, so only these five dynasties are considered for now
        # dy_list = [u'先秦', u'唐代', u'宋代', u'元代', u'清代']
        # dy_list = [u'金朝']

        # merge all the translated texts, to be used as training source data
        # for dystany in dy_list:
        #     filename = dystany + 'fanyi.txt'
        #     merger_file = codecs.open(filename, 'w', 'utf8')
        #     fy_content = dic_dystany[dystany]['content_fanyi_merge']
        #     fy_content = fy_content.replace('\r', '\n').replace('\n\n', '\n').replace('\n\n', '\n')
        #     fy_content = fy_content.replace(u'\u3000\u3000\n', '').replace(u'\u3000\n', '')
        #     merger_file.write(fy_content)
        #     merger_file.close()

        thul = thulac.thulac('-seg_only')
        thul.run()

        writer = ExcelWriter('gushiwen.xlsx')
        for dystany in dy_list:
            fy_content = dic_dystany[dystany]['content_fanyi_merge']
            ls = []
            while len(fy_content) > 10000:
                con = fy_content[:10000]
                fy_content = fy_content[10000:]
                ls = ls + thul.cut(con.encode('utf8'))
            if fy_content:
                ls += thul.cut(fy_content.encode('utf8'))
            fy_cont_seg = [val for val in ls
                           if len(val) > 3]  # drop single characters: thulac returns str, and one UTF-8 Chinese character has length 3
            print(type(fy_cont_seg), ' ', len(fy_cont_seg), ' ', dystany)
            fy_cont_seg = [val.decode('utf8') for val in fy_cont_seg]
            fy_cont_seg_df = DataFrame({'segment': fy_cont_seg})
            fy_cont_seg_df = fy_cont_seg_df[~fy_cont_seg_df.segment.
                                            isin(stop_words)]
            segtat = fy_cont_seg_df.groupby(by=['segment'])['segment'].agg({
                'count':
                numpy.size
            }).reset_index().sort_values(by=['count'], ascending=False)
            segtat.to_excel(writer, dystany)
            writer.save()
            print('End of dystany : ', dystany)
        writer.close()
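
Note that passing a dict to SeriesGroupBy.agg, as in the 'count': numpy.size call above, was deprecated and later removed. With current pandas the same per-segment counts could be written, for example, as:

segtat = (fy_cont_seg_df.groupby('segment')
          .size()
          .reset_index(name='count')
          .sort_values('count', ascending=False))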
Code example #37
def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2, ['foo', 'bar'] * 2],
                                          names=['A', 'B', 'C'])
    expected = DataFrame({
        'values': Series([1, 2, 3, 4], index=exp_index)
    }).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [cat1, cat2, ['foo', 'bar']],
                                                  list('ABC'))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected, [cat1, cat2],
                                                  list('AB'))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        'cat': pd.Categorical(["a", "b", "a", "b"],
                              categories=["a", "b", "c"],
                              ordered=True),
        'ints': [1, 1, 2, 2],
        'val': [10, 20, 30, 40]
    }
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = pd.CategoricalIndex(list('ab'),
                                    name="cat",
                                    categories=list('abc'),
                                    ordered=True)
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
                         index=exp_index)
    if not observed:
        index = pd.CategoricalIndex(list('abc'),
                                    name="cat",
                                    categories=list('abc'),
                                    ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame({
        "val": [10, 30, 20, 40],
        "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                              categories=['a', 'b', 'c'],
                              ordered=True),
        "ints": [1, 2, 1, 2]
    }).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [df.cat.values, [1, 2]],
                                                  ['cat', 'ints'])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        'foo': [10, 8, 4, 8, 4, 1, 1],
        'bar': [10, 20, 30, 40, 50, 60, 70],
        'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']
    }
    df = pd.DataFrame(d)
    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
    df['range'] = cat
    groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
    result = groups.agg('mean')

    groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
    expected = groups2.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)
Code example #38
def transform(df: pandas.DataFrame) -> pandas.DataFrame:
    df = _apply_transform_list(df, TRANSFORMERS)
    return df.groupby('uuid').apply(_group_transform_df)
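
The helpers here (TRANSFORMERS, _apply_transform_list, _group_transform_df) are not shown. The shape of the pattern, with a made-up per-group transform standing in for _group_transform_df:

import pandas as pd

def _demo_group_transform(g: pd.DataFrame) -> pd.DataFrame:
    # e.g. center a value column within each uuid group
    g = g.copy()
    g['value'] = g['value'] - g['value'].mean()
    return g

df = pd.DataFrame({'uuid': ['a', 'a', 'b'], 'value': [1.0, 3.0, 5.0]})
print(df.groupby('uuid').apply(_demo_group_transform))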
Code example #39
    def portfolio_cross_section(
            self,
            pipeline_df: pd.DataFrame,
            allocation_method: PortfolioAllocationModel = ValueWeightedPortfolio
    ):
        """
        We cross-split the portfolio based on factors and percentiles, and pick the
        resulting portfolios to go long and short on. For example:
                                             Median ME
                                                |
                                Small Value     |   Big Value
        70th BE/ME Percentile ------------------|---------------
                                Small Neutral   |   Big Neutral
        30th BE/ME Percentile ------------------|---------------
                                Small Growth    |   Big Growth
        :param pipeline_df
        :return:
        """
        def split_stocks_quantile(df: pd.DataFrame, factor: str):
            factor_rank = None
            for factor_ in self.factors:
                if factor_.__class__.__name__ == factor:
                    factor_rank = factor_.rank_split

            bottom_quantile = df[
                df[factor] <= df[factor].quantile(factor_rank[0])]
            bottom_quantile_stocks = [
                stock for date, stock in bottom_quantile.index.values
            ]

            top_quantile = df[
                df[factor] >= df[factor].quantile(factor_rank[1])]
            top_quantile_stocks = [
                stock for date, stock in top_quantile.index.values
            ]

            return bottom_quantile_stocks, top_quantile_stocks

        def select_stocks(group):
            returns_df = pd.DataFrame()
            from_date = group.index.values[0][0]
            from_date_idx = group.index.levels[0].to_list().index(from_date)
            try:
                # TODO think about inclusive at rebalancing day
                to_date = group.index.levels[0][from_date_idx +
                                                1] - timedelta(days=1)
            except IndexError:
                to_date = self.end_date
            if 'SMB' in group.columns:
                # Do the Fama French / AQR Way
                small_size_stocks, large_size_stocks = split_stocks_quantile(
                    df=group, factor='SMB')

                for factor in group.columns:
                    if factor != 'SMB':
                        bottom_quantile_stocks, top_quantile_stocks = split_stocks_quantile(
                            df=group, factor=factor)

                        small_top_stocks = set.intersection(
                            set(small_size_stocks), set(top_quantile_stocks))
                        big_top_stocks = set.intersection(
                            set(large_size_stocks), set(top_quantile_stocks))

                        small_bottom_stocks = set.intersection(
                            set(small_size_stocks),
                            set(bottom_quantile_stocks))
                        big_bottom_stocks = set.intersection(
                            set(large_size_stocks),
                            set(bottom_quantile_stocks))

                        cross_section_returns = {
                            name: {}
                            for name in [
                                'Small Top', 'Big Top', 'Small Bottom',
                                'Big Bottom'
                            ]
                        }

                        for name, stocks in zip([
                                'Small Top', 'Big Top', 'Small Bottom',
                                'Big Bottom'
                        ], [
                                small_top_stocks, big_top_stocks,
                                small_bottom_stocks, big_bottom_stocks
                        ]):
                            if len(stocks) > 0:
                                portfolio = self.asset_returns[stocks]
                                # To allocate weight, need history of returns up to now
                                weights = allocation_method(
                                    Portfolio(portfolio.loc[:from_date])
                                ).solve_weights()
                                cross_section_returns[name]['Weight Allocation'] \
                                    = [(stock, weight) for stock, weight in zip(portfolio.columns, weights)]

                                returns = np.sum(
                                    weights * portfolio.loc[from_date:to_date],
                                    axis=1)
                                cross_section_returns[name][
                                    'Returns'] = returns
                            else:
                                dates = pd.date_range(
                                    start=from_date + timedelta(days=1) -
                                    timedelta(seconds=1),
                                    end=to_date + timedelta(days=1) -
                                    timedelta(seconds=1)).to_list()
                                cross_section_returns[name][
                                    'Returns'] = pd.Series(
                                        np.zeros((to_date - from_date).days +
                                                 1),
                                        index=dates)
                                cross_section_returns[name][
                                    'Weight Allocation'] = [('', 0)]
                        # HML = 1/2 (Small Value + Big Value) - 1/2 (Small Growth + Big Growth).
                        long_stocks = small_top_stocks | big_top_stocks
                        short_stocks = small_bottom_stocks | big_bottom_stocks

                        for factor_ in self.factors:
                            if factor_.__class__.__name__ == factor:
                                # TODO
                                df = pd.DataFrame(
                                    columns=['Long Stocks', 'Short Stocks'],
                                    data=0)
                                factor_.holdings.append()

                        returns = 0.5 * (cross_section_returns['Small Top']['Returns'].add(
                            cross_section_returns['Big Top']['Returns'], fill_value=0)) \
                                  - 0.5 * (cross_section_returns['Small Bottom']['Returns'].add(
                            cross_section_returns['Big Bottom']['Returns'], fill_value=0))
                        returns.name = factor
                        returns_df = returns_df.join(
                            [returns], how='inner'
                        ) if not returns_df.empty else returns.to_frame()
            for factor, returns in returns_df.items():
                factor_obj = None
                for f_ in self.factors:
                    if f_.__class__.__name__ == factor:
                        factor_obj = f_
                factor_obj.returns = returns
                # factor_obj.holdings =
            return returns_df

        factor_returns = pipeline_df.groupby(level=0,
                                             axis=0).apply(select_stocks)
        factor_returns.index = factor_returns.index.droplevel(0)
        return factor_returns
Code example #40
def get_var(var):
    variable = []
    for i in friends:
        value = i[var]
        variable.append(value)
    return variable

# use get_var to get the information
NickName = get_var('NickName')
Province = get_var('Province')
print(NickName, Province)
data = {
    'NickName': NickName,
    'Province': Province,
}
frame = DataFrame(data)  # save the data
print(frame)
# process the data: group by province
# (column names: 人数 = head count, 好友数 = number of friends, 地区 = region)
aggResult = frame.groupby(by=['Province'])['NickName'].agg({'人数': numpy.size})
print(aggResult)
# convert the data type

aggResult['好友数'] = aggResult.人数.astype(int)
aggResult['地区'] = aggResult.index
# normalize the data
# new value = (raw value - min) / (max - min)
aggResult['scala'] = (
    (aggResult.好友数 - aggResult.好友数.min())
    / (aggResult.好友数.max() - aggResult.好友数.min())
)
print(aggResult['好友数'], aggResult['地区'], aggResult['scala'])
Code example #41
def add_realization_traces(data_frame: pd.DataFrame, color_by: str,
                           colors: Dict[str,
                                        List[str]], phase: str) -> List[dict]:
    """Renders line traces for individual realizations"""
    # pylint: disable-msg=too-many-locals

    traces = []

    data_frame = data_frame.loc[data_frame["KEYWORD"] == PvtPlot.PHASES[phase]]
    column_name = "GOR"

    border_value_pressure: Dict[str, list] = {}
    border_value_viscosity: Dict[str, list] = {}
    border_value_volumefactor: Dict[str, list] = {}
    constant_group = (data_frame["PVTNUM"].iloc[0] if color_by == "ENSEMBLE"
                      else data_frame["ENSEMBLE"].iloc[0])

    for (group, grouped_data_frame) in data_frame.groupby(color_by):
        for ratio_no, gas_oil_ratio in enumerate(
                grouped_data_frame[column_name].unique()):
            for realization_no, (realization,
                                 realization_data_frame) in enumerate(
                                     grouped_data_frame.groupby("REAL")):
                if group not in border_value_pressure:
                    border_value_pressure[group] = []
                    border_value_viscosity[group] = []
                    border_value_volumefactor[group] = []
                try:
                    border_value_pressure[group].append(
                        realization_data_frame.loc[
                            realization_data_frame[column_name] ==
                            gas_oil_ratio]["PRESSURE"].iloc[0])
                    border_value_volumefactor[group].append(
                        realization_data_frame.loc[
                            realization_data_frame[column_name] ==
                            gas_oil_ratio]["VOLUMEFACTOR"].iloc[0])
                    border_value_viscosity[group].append(
                        realization_data_frame.loc[
                            realization_data_frame[column_name] ==
                            gas_oil_ratio]["VISCOSITY"].iloc[0])
                except IndexError as exc:
                    raise IndexError(
                        "This error is most likely due to PVT differences between "
                        "realizations within the same ensemble. This is currently not "
                        "supported.") from exc

                traces.extend([{
                    "type": "scatter",
                    "x": realization_data_frame.loc[
                        realization_data_frame[column_name] == gas_oil_ratio
                    ]["PRESSURE"],
                    "y": realization_data_frame.loc[
                        realization_data_frame[column_name] == gas_oil_ratio
                    ]["VOLUMEFACTOR"],
                    "xaxis": "x",
                    "yaxis": "y",
                    "hovertext": (
                        f"{'Rs' if phase == 'OIL' else 'Rv'} = {gas_oil_ratio}"
                        ", Pvtnum: "
                        f"{group if color_by == 'PVTNUM' else constant_group}<br>"
                        f"Realization: {realization}, Ensemble: "
                        f"{group if color_by == 'ENSEMBLE' else constant_group}"
                    ),
                    "name": group,
                    "legendgroup": group,
                    "marker": {
                        "color": colors.get(group, colors[list(colors.keys())[-1]])
                    },
                    "showlegend": realization_no == 0 and ratio_no == 0,
                }])
                traces.extend([{
                    "type": "scatter",
                    "x": realization_data_frame.loc[
                        realization_data_frame[column_name] == gas_oil_ratio
                    ]["PRESSURE"],
                    "y": realization_data_frame.loc[
                        realization_data_frame[column_name] == gas_oil_ratio
                    ]["VISCOSITY"],
                    "xaxis": "x2",
                    "yaxis": "y2",
                    "hovertext": (
                        f"{'Rs' if phase == 'OIL' else 'Rv'} = {gas_oil_ratio}"
                        ", Pvtnum: "
                        f"{group if color_by == 'PVTNUM' else constant_group}<br>"
                        f"Realization: {realization}, Ensemble: "
                        f"{group if color_by == 'ENSEMBLE' else constant_group}"
                    ),
                    "name": group,
                    "legendgroup": group,
                    "marker": {
                        "color": colors.get(group, colors[list(colors.keys())[-1]])
                    },
                    "showlegend": False,
                }])

    for group in border_value_pressure:
        traces.extend([{
            "type": "scatter",
            "mode": "lines",
            "x": border_value_pressure[group],
            "y": border_value_volumefactor[group],
            "xaxis": "x",
            "yaxis": "y",
            "line": {
                "width": 1,
                "color": colors.get(group, colors[list(colors.keys())[-1]]),
            },
            "showlegend": False,
        }])
        traces.extend([{
            "type": "scatter",
            "mode": "lines",
            "x": border_value_pressure[group],
            "y": border_value_viscosity[group],
            "xaxis": "x2",
            "yaxis": "y2",
            "line": {
                "width": 1,
                "color": colors.get(group, colors[list(colors.keys())[-1]]),
            },
            "showlegend": False,
        }])
    return traces
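
For orientation, a minimal, self-contained sketch of the trace-per-group pattern used above; the helper name, toy data, and color mapping are assumptions, not part of the original plugin:

import pandas as pd

def group_traces(df, group_col, x_col, y_col, colors):
    # One scatter trace per group; legendgroup ties all traces of a group
    # together so toggling the legend entry shows/hides them at once.
    traces = []
    for group, gdf in df.groupby(group_col):
        traces.append({
            "type": "scatter",
            "x": gdf[x_col].tolist(),
            "y": gdf[y_col].tolist(),
            "name": str(group),
            "legendgroup": str(group),
            "marker": {"color": colors.get(group, "gray")},
        })
    return traces

df = pd.DataFrame({"PVTNUM": [1, 1, 2],
                   "PRESSURE": [10.0, 20.0, 15.0],
                   "VOLUMEFACTOR": [1.2, 1.1, 1.3]})
traces = group_traces(df, "PVTNUM", "PRESSURE", "VOLUMEFACTOR",
                      colors={1: "red", 2: "blue"})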
コード例 #42
0
def performance_scaling(data: pd.DataFrame,
                        set_axes_limits: bool=True,
                        plot_regression: bool=True) -> (plt.Figure, plt.Axes):
    """
    Parameters
    ----------
    data : pd.DataFrame with 6 columns:
        "year",
        "performance",
        "kind" ∈ ["compute", "memory", "interconnect"],
        "name" (label shown in the plot, it can be empty),
        "base" (base value used for speedup, it can be empty),
        "comment" (e.g. data source or non-used label, it can be empty).

    Returns
    -------
    fig : matplotlib figure containing the plot
    ax : matplotlib axis containing the plot
    """
    
    ##############
    # Plot setup #
    ##############
    
    # Reset matplotlib settings;
    plt.rcdefaults()
    # Setup general plotting settings;
    sns.set_style("white", {"ytick.left": True, "xtick.bottom": True})
    plt.rcParams["font.family"] = ["Latin Modern Roman Demi"]
    plt.rcParams['axes.labelpad'] = 0  # Padding between axis and axis label;
    plt.rcParams['xtick.major.pad'] = 1  # Padding between axis ticks and tick labels;
    plt.rcParams['ytick.major.pad'] = 1  # Padding between axis ticks and tick labels;
    plt.rcParams['axes.linewidth'] = 0.8  # Line width of the axis borders;
    
    # Create a figure for the plot, and adjust margins;
    fig = plt.figure(figsize=(6, 2.5))
    gs = gridspec.GridSpec(1, 1)
    plt.subplots_adjust(top=0.98,
                        bottom=0.1,
                        left=0.12,
                        right=0.99)  
    ax = fig.add_subplot(gs[0, 0])
    
    # Set axes limits;        
    if set_axes_limits:
        ax.set_xlim(X_LIMITS)
        ax.set_ylim(Y_LIMITS)
    
    #################
    # Main plot #####
    #################    

    # Measure performance increase over 20 and 2 years;
    kind_increase = {}      
    
    # Add a scatterplot for individual elements of the dataset, and change color based on hardware type;
    ax = sns.scatterplot(x="year", y="performance", hue="kind", style="kind", palette=PALETTE, markers=MARKERS, s=15,
                      data=data, ax=ax, edgecolor="#2f2f2f", linewidth=0.5, zorder=4)

    # Add a regression plot to highlight the correlation between variables, with 95% confidence intervals;
    if plot_regression:
        for i, (kind, g) in enumerate(data.groupby("kind", sort=False)):            
            data_tmp = g.copy()
            # We fit a straight line on the log of the relative performance, as the scaling is exponential.
            # Then, the real prediction is 10**prediction;
            regr = linear_model.LinearRegression()
            regr.fit(data_tmp["year"].values.reshape(-1, 1), np.log10(data_tmp["performance"].values.reshape(-1, 1)))
            data_tmp["prediction"] = np.power(10, regr.predict(data_tmp["year"].values.astype(float).reshape(-1, 1)))
            ax = sns.lineplot(x=[data_tmp["year"].iloc[0], data_tmp["year"].iloc[-1]],
                              y=[data_tmp["prediction"].iloc[0], data_tmp["prediction"].iloc[-1]],
                              color=PALETTE[i], ax=ax, alpha=0.5, linewidth=6)
            
            # Use the regression line to obtain the slope over 2 and 20 years;
            slope = (np.log10(data_tmp["prediction"].iloc[-1]) - np.log10(data_tmp["prediction"].iloc[0])) / ((data_tmp["year"].iloc[-1] - data_tmp["year"].iloc[0]).days / 365)
            slope_2_years = 10**(slope * 2)
            slope_20_years = 10**(slope * 20)
            kind_increase[kind] = (slope_2_years, slope_20_years)
    ax.legend_.remove()  # Hack to remove legend;

    #####################
    # Add labels ########
    #####################
    
    # Associate a color to each kind of hardware (compute, memory, interconnection)
    def get_color(c):  # Make the color darker, to use it for text;
        hue, saturation, brightness = colors.rgb_to_hsv(colors.to_rgb(c))
        return sns.set_hls_values(c, l=brightness * 0.6, s=saturation * 0.7)
    kind_to_col = {k: get_color(PALETTE[i]) for i, k in enumerate(data["kind"].unique())}
    
    data["name"] = data["name"].fillna("")
    for i, row in data.iterrows():
        label = row["name"]
        # Label-specific adjustments;
        if label:
            if label in ("Pentium II Xeon", "PCIe 4.0"):
                xytext = (5, -9)
            elif label in ("Radeon Fiji", "TPUv2"):
                xytext = (-7, 5)
            elif row["kind"] == "interconnect":
                xytext = (0, -9)
            else:
                xytext = (0, 5)
            ax.annotate(label, xy=(row["year"], row["performance"]), size=7, xytext=xytext,
                        textcoords="offset points", ha="center", color=kind_to_col[row["kind"]])
    
    #####################
    # Style fine-tuning #
    #####################
    
    # Log-scale y-axis;
    plt.yscale("log")
    
    # Turn on the grid;
    ax.yaxis.grid(True, linewidth=0.3)
    ax.xaxis.grid(True, linewidth=0.3)
    
    # Set tick number and parameters on x and y axes;
    def year_formatter(x, pos=None):
        d = num2date(x)
        if (d.year - X_LIMITS[0].year) % 3 != 0:
            return ""
        else:
            return d.year
    ax.xaxis.set_major_locator(YearLocator())
    ax.xaxis.set_minor_locator(MonthLocator(interval=3))
    ax.xaxis.set_major_formatter(FuncFormatter(year_formatter))
    ax.yaxis.set_major_locator(plt.LogLocator(base=10, numticks=15))
    ax.tick_params(axis="x", direction="out", which="both", bottom=True, top=False, labelsize=7, width=0.5, size=5)
    ax.tick_params(axis="x", direction="out", which="minor", size=2)  # Update size of minor ticks;
    ax.tick_params(axis="y", direction="out", which="both", left=True, right=False, labelsize=7, width=0.5, size=5)
    ax.tick_params(axis="y", direction="out", which="minor", size=2)  # Update size of minor ticks;
    
    # Ticks, showing relative performance;
    def format_speedup(l):
        if l >= 1:
            return str(int(l))
        else:
            return f"{l:.1f}"
    ax.set_yticklabels(labels=[format_speedup(l) + r"$\mathdefault{\times}$" for l in ax.get_yticks()], ha="right", fontsize=7)
 
    # Add a fake legend with summary data.
    # We don't use a real legend as we need rows with different colors and we don't want patches on the left.
    # Also, we want the text to look justified.
    def get_kind_label(k):
        kind_name = ""
        if k == "compute":
            kind_name = "HW FLOPS"
        elif k == "memory":
            kind_name = "DRAM BW"
        else:
            kind_name = "Interconnect BW"
        return kind_name
    # Create a rectangle used as background;
    rectangle = {"boxstyle": "round", "facecolor": "white", "alpha": 0.8, "edgecolor": "#B8B8B8", "linewidth": 0.5, "pad": 0.5}
    for i, (k, v) in enumerate(kind_increase.items()):
        pad = " " * 48 + "\n\n"  # Add padding to first label, to create a large rectangle that covers other labels; 
        # Use two annotations, to make the text look justified;
        ax.annotate(get_kind_label(k) + ":" + (pad if i == 0 else ""), xy=(0.023, 0.94 - 0.05 * i),
                    xycoords="axes fraction", fontsize=7, color=kind_to_col[k], ha="left", va="top", bbox=rectangle if i == 0 else None)
        ax.annotate(f"{v[1]:.0f}" + r"$\mathdefault{\times}$" + f"/20 years ({v[0]:.1f}" + r"$\mathdefault{\times}$"+ "/2 years)",
                    xy=(0.43, 0.941 - 0.05 * i), xycoords="axes fraction", fontsize=7, color=kind_to_col[k], ha="right", va="top")
        
    # Add axes labels;
    plt.ylabel("Performance Scaling", fontsize=8)
    plt.xlabel(None)
    
    return fig, ax
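
The regression above fits a straight line to log10 of the performance values and exponentiates the prediction, since the scaling is exponential. A standalone sketch of that trick on synthetic data (numpy and scikit-learn only):

import numpy as np
from sklearn import linear_model

years = np.arange(2000, 2020, dtype=float).reshape(-1, 1)
perf = 1.5 ** (years.ravel() - 2000)  # synthetic exponential growth

regr = linear_model.LinearRegression()
regr.fit(years, np.log10(perf).reshape(-1, 1))  # fit in log space
pred = 10 ** regr.predict(years)                # back-transform to linear scale

slope = regr.coef_[0][0]  # log10 increase per year
print(10 ** (slope * 2))  # growth over 2 years, ~1.5**2 = 2.25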
コード例 #43
0
 def pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
     df = df.groupby("StatisticsDate",
                     as_index=False).sum().sort_values("StatisticsDate")
     return df.pipe(self.pipe_rename_columns).pipe(self.pipe_metadata)
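
DataFrame.pipe just threads the frame through each function, so the chain reads top to bottom. A toy sketch with stand-in versions of the two pipe steps (both function bodies are hypothetical here):

import pandas as pd

def pipe_rename_columns(df):
    return df.rename(columns={"Cases": "cases"})

def pipe_metadata(df):
    return df.assign(source="toy")

df = pd.DataFrame({"StatisticsDate": ["2021-01-02", "2021-01-01"],
                   "Cases": [3, 1]})
out = (df.groupby("StatisticsDate", as_index=False).sum()
         .sort_values("StatisticsDate")
         .pipe(pipe_rename_columns)
         .pipe(pipe_metadata))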
コード例 #44
0
def test_nth():
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')

    assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
    assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
    assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
    assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
    assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
    assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
    assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
    assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
    assert_frame_equal(g[['B']].nth(0),
                       df.loc[[0, 2], ['A', 'B']].set_index('A'))

    exp = df.set_index('A')
    assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])

    exp['B'] = np.nan
    assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
    assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])

    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame({'color': {0: 'green',
                              1: 'green',
                              2: 'red',
                              3: 'red',
                              4: 'red'},
                    'food': {0: 'ham',
                             1: 'eggs',
                             2: 'eggs',
                             3: 'ham',
                             4: 'pork'},
                    'two': {0: 1.5456590000000001,
                            1: -0.070345000000000005,
                            2: -2.4004539999999999,
                            3: 0.46206000000000003,
                            4: 0.52350799999999997},
                    'one': {0: 0.56573799999999996,
                            1: -0.9742360000000001,
                            2: 1.033801,
                            3: -0.78543499999999999,
                            4: 0.70422799999999997}}).set_index(['color',
                                                                 'food'])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    assert_frame_equal(result, expected)

    # GH 7559
    # from the vbench
    df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
    s = df[1]
    g = df[0]
    expected = s.groupby(g).first()
    expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
    assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = s[g == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    # this is NOT the same as .first (as sorted is default!)
    # as it keeps the order in the series (and not the group order)
    # related GH 7287
    expected = s.groupby(g, sort=False).first()
    result = s.groupby(g, sort=False).nth(0, dropna='all')
    assert_series_equal(result, expected)

    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')
    # PR 17493, related to issue 11038
    # test Series.nth with True for dropna produces FutureWarning
    with assert_produces_warning(FutureWarning):
        result = g.B.nth(0, dropna=True)
    expected = g.B.first()
    assert_series_equal(result, expected)

    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
                   columns=['A', 'B'])
    g = df.groupby('A')

    assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
    assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
    assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
    assert_frame_equal(
        g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
    assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
    assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))

    business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
                                   freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
         '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
         '2014/6/27', '2014/6/30'])
    expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
    assert_frame_equal(result, expected)
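
As the comments above note, nth(0, dropna='all') is not the same as first() in general, but with sort=False both keep the order in which the groups appear. A small sketch using the same API as these tests:

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 4.0, 6.0, 7.0])
g = pd.Series([5, 5, 1, 1])

# first() skips the leading NaN in group 5; dropna='all' makes nth(0)
# do the same, and sort=False keeps the appearance order (5, then 1).
print(s.groupby(g, sort=False).first())
print(s.groupby(g, sort=False).nth(0, dropna='all'))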
コード例 #45
0
ファイル: stream.py プロジェクト: yenchih/mining
def data(ws, mongodb, slug):
    if not ws:
        abort(400, 'Expected WebSocket request.')

    DW = DataWarehouse()

    element = mongodb['element'].find_one({'slug': slug})

    element['page_limit'] = 50
    if request.GET.get('limit', 'true') == 'false':
        element['page_limit'] = 9999999999

    data = DW.get(element.get('cube'))
    columns = data.get('columns') or []

    fields = columns
    if request.GET.get('fields', None):
        fields = request.GET.get('fields').split(',')

    cube_last_update = mongodb['cube'].find_one({'slug': element.get('cube')})
    ws.send(json.dumps({'type': 'last_update',
                        'data': str(cube_last_update.get('lastupdate', ''))}))

    ws.send(json.dumps({'type': 'columns', 'data': fields}))

    filters = [i[0] for i in request.GET.items()
               if len(i[0].split('filter__')) > 1]

    if element['type'] == 'grid':
        page = int(request.GET.get('page', 1))
        page_start = 0
        page_end = element['page_limit']
        if page >= 2:
            page_end = element['page_limit'] * page
            page_start = page_end - element['page_limit']
    else:
        page_start = None
        page_end = None

    df = DataFrame(data.get('data') or {}, columns=fields)
    if len(filters) >= 1:
        for f in filters:
            s = f.split('__')
            field = s[1]
            operator = s[2]
            value = request.GET.get(f)
            if operator == 'like':
                df = df[df[field].str.contains(value)]
            elif operator == 'regex':
                df = DataFrameSearchColumn(df, field, value, operator)
            else:
                df = df.query(df_generate(df, value, f))

    groupby = []
    if request.GET.get('groupby', None):
        groupby = request.GET.get('groupby').split(',')
    if len(groupby) >= 1:
        df = DataFrame(df.groupby(groupby).grouper.get_group_levels())

    if request.GET.get('orderby',
                       element.get('orderby', None)) and request.GET.get(
            'orderby', element.get('orderby', None)) in fields:

        orderby = request.GET.get('orderby', element.get('orderby', ''))
        if type(orderby) == str:
            orderby = orderby.split(',')
        orderby__order = request.GET.get('orderby__order',
                                         element.get('orderby__order', ''))
        if type(orderby__order) == str:
            orderby__order = orderby__order.split(',')
        ind = 0
        for orde in orderby__order:
            if orde == '0':
                orderby__order[ind] = False
            else:
                orderby__order[ind] = True
            ind += 1
        df = df.sort_values(orderby, ascending=orderby__order)

    ws.send(json.dumps({'type': 'max_page', 'data': len(df)}))

    # CLEAN MEMORY
    del filters, fields, columns
    gc.collect()
    categories = []
    for i in df.to_dict(orient='records')[page_start:page_end]:
        if element.get('categories', None):
            categories.append(i[element.get('categories')])
        ws.send(json.dumps({'type': 'data', 'data': i}))

    # CLEAN MEMORY
    del df
    gc.collect()

    ws.send(json.dumps({'type': 'categories', 'data': categories}))
    ws.send(json.dumps({'type': 'close'}))

    # CLEAN MEMORY
    del categories
    gc.collect()
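
The filter keys consumed above follow a filter__<field>__<operator>=<value> convention. A minimal sketch of parsing such keys against a plain dict (the query dict stands in for request.GET):

import pandas as pd

query = {"filter__age__like": "3", "page": "1"}
df = pd.DataFrame({"age": ["23", "31", "45"]})

for key, value in query.items():
    parts = key.split("__")
    if len(parts) == 3 and parts[0] == "filter":
        _, field, operator = parts
        if operator == "like":
            df = df[df[field].str.contains(value)]

print(df)  # rows whose 'age' contains "3": 23 and 31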
コード例 #46
0
def test_agg():
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=["index", "date"])
    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    a_mean = r["A"].mean()
    a_std = r["A"].std()
    a_sum = r["A"].sum()
    b_mean = r["B"].mean()
    b_std = r["B"].std()
    b_sum = r["B"].sum()

    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean",
                                                                "std"]])
    for t in cases:
        result = t.aggregate([np.mean, np.std])
        tm.assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, b_std], axis=1)
    for t in cases:
        result = t.aggregate({"A": np.mean, "B": np.std})
        tm.assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"]})
        tm.assert_frame_equal(result, expected)

    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = ["mean", "sum"]
    for t in cases:
        result = t["A"].aggregate(["mean", "sum"])
        tm.assert_frame_equal(result, expected)

    msg = "nested renamer is not supported"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({"A": {"mean": "mean", "sum": "sum"}})

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum"),
                                                  ("B", "mean2"),
                                                  ("B", "sum2")])
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({
                "A": {
                    "mean": "mean",
                    "sum": "sum"
                },
                "B": {
                    "mean2": "mean",
                    "sum2": "sum"
                },
            })

    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std"),
                                                  ("B", "mean"), ("B", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([
        ("r1", "A", "mean"),
        ("r1", "A", "sum"),
        ("r2", "B", "mean"),
        ("r2", "B", "sum"),
    ])
コード例 #47
0
def test_groupby_aggregate_empty_key_empty_return():
    # GH: 32580 Check if everything works, when return is empty
    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
    result = df.groupby("a").agg({"b": []})
    expected = DataFrame(columns=pd.MultiIndex(levels=[["b"], []], codes=[[], []]))
    tm.assert_frame_equal(result, expected)
コード例 #48
0
    def _plot_per_cenwave(self, df: pd.DataFrame, shift: str, outliers: pd.DataFrame = None) -> int:
        """Plot shift v time and A-B v time by grating/cenwave"""
        trace_number = 0  # Keep track of the number of traces created and added

        groups = df.groupby(['OPT_ELEM', 'CENWAVE'])

        # Set symbols for different FP-POS
        fp_symbols = {
            1: 'circle',
            2: 'cross',
            3: 'triangle-up',
            4: 'x'
        }

        # Compute A-B shift difference
        seg_diff_results = compute_segment_diff(df, shift, 'FUVA', 'FUVB')

        # Plot A-B v time
        self.figure.add_trace(
            go.Scattergl(
                x=seg_diff_results.lamp_time,
                y=seg_diff_results.seg_diff,
                name='FUVA - FUVB',
                mode='markers',
                text=seg_diff_results.hover_text,
                visible=False,
            ),
            row=1,
            col=1
        )
        trace_number += 1

        # Plot shift v time per grating/cenwave group
        for i, (name, group) in enumerate(groups):
            trace_number += 1

            grating, cenwave = name
            lamp_time = absolute_time(df=group)

            self.figure.add_trace(
                go.Scattergl(
                    x=lamp_time.to_datetime(),
                    y=group[shift],
                    name=f'{grating}-{cenwave}',
                    mode='markers',
                    text=group.hover_text,
                    visible=False,
                    marker=dict(  # Color markers based on cenwave
                        cmax=len(df.CENWAVE.unique()) - 1,  # Individual plots need to be on the same scale
                        cmin=0,
                        color=list(repeat(i, len(group))),
                        colorscale='Viridis',
                        symbol=[fp_symbols[fp] for fp in group.FPPOS],
                        size=[
                            10 if time > LP_MOVES[4] and lp == 3 else 6
                            for lp, time in zip(group.LIFE_ADJ, Time(group.EXPSTART, format='mjd').to_datetime())
                        ]  # Set the size to distinguish exposures taken at LP3 after the move to LP4
                    )
                ),
                row=2,
                col=1,
            )

        if outliers is not None:
            self.figure.add_trace(
                go.Scattergl(
                    x=outliers.lamp_time,
                    y=outliers.seg_diff,
                    name='A - B Outliers',
                    mode='markers',
                    text=outliers.hover_text,
                    visible=False,
                    marker=dict(color='red'),
                ),
                row=1,
                col=1
            )
            trace_number += 1

            # Plot outlier points in a different color
            outlier_mainplot = df[df.apply(lambda x: x.ROOTNAME in outliers.ROOTNAME.values, axis=1)]
            outlier_groups = outlier_mainplot.groupby(['OPT_ELEM', 'CENWAVE'])
            for name, group in outlier_groups:
                trace_number += 1

                grating, cenwave = name
                lamp_time = absolute_time(df=group)

                self.figure.add_trace(
                    go.Scattergl(
                        x=lamp_time.to_datetime(),
                        y=group[shift],
                        name=f'{grating}-{cenwave} Outliers',
                        mode='markers',
                        text=group.hover_text,
                        visible=False,
                        marker=dict(
                            color='red',
                            symbol=[fp_symbols[fp] for fp in group.FPPOS],
                            size=[
                                10 if time > LP_MOVES[4] and lp == 3 else 6
                                for lp, time in zip(group.LIFE_ADJ, Time(group.EXPSTART, format='mjd').to_datetime())
                            ]  # Set the size to distinguish exposures taken at LP3 after the move to LP4
                        )
                    ),
                    row=2,
                    col=1,
                )

        return trace_number
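
compute_segment_diff is project-specific, but the per-exposure A-B difference it feeds into the first subplot can be sketched with a pivot: one column per segment, then a subtraction. Column names are borrowed from the snippet; the pivot approach is an assumption:

import pandas as pd

df = pd.DataFrame({'ROOTNAME': ['r1', 'r1', 'r2', 'r2'],
                   'SEGMENT': ['FUVA', 'FUVB', 'FUVA', 'FUVB'],
                   'SHIFT_DISP': [1.0, 0.4, 2.0, 1.5]})

# One row per exposure, one column per segment, then FUVA - FUVB.
wide = df.pivot(index='ROOTNAME', columns='SEGMENT', values='SHIFT_DISP')
seg_diff = wide['FUVA'] - wide['FUVB']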
コード例 #49
0
    def test_cython_transform_frame(self, op, args, targop):
        s = Series(np.random.randn(1000))
        s_missing = s.copy()
        s_missing.iloc[2:10] = np.nan
        labels = np.random.randint(0, 50, size=1000).astype(float)
        strings = list('qwertyuiopasdfghjklz')
        strings_missing = strings[:]
        strings_missing[5] = np.nan
        df = DataFrame({
            'float':
            s,
            'float_missing':
            s_missing,
            'int': [1, 1, 1, 1, 2] * 200,
            'datetime':
            pd.date_range('1990-1-1', periods=1000),
            'timedelta':
            pd.timedelta_range(1, freq='s', periods=1000),
            'string':
            strings * 50,
            'string_missing':
            strings_missing * 50
        })
        df['cat'] = df['string'].astype('category')

        df2 = df.copy()
        df2.index = pd.MultiIndex.from_product([range(100), range(10)])

        # DataFrame - Single and MultiIndex,
        # group by values, index level, columns
        for df in [df, df2]:
            for gb_target in [
                    dict(by=labels),
                    dict(level=0),
                    dict(by='string')
            ]:  # dict(by='string_missing')]:
                # dict(by=['int','string'])]:

                gb = df.groupby(**gb_target)
                # whitelisted methods set the selection before applying
                # a bit of a hack to make sure the cythonized shift
                # is equivalent to pre 0.17.1 behavior
                if op == 'shift':
                    gb._set_group_selection()

                if op != 'shift' and 'int' not in gb_target:
                    # numeric apply fastpath promotes dtype so have
                    # to apply separately and concat
                    i = gb[['int']].apply(targop)
                    f = gb[['float', 'float_missing']].apply(targop)
                    expected = pd.concat([f, i], axis=1)
                else:
                    expected = gb.apply(targop)

                expected = expected.sort_index(axis=1)
                tm.assert_frame_equal(
                    expected,
                    gb.transform(op, *args).sort_index(axis=1))
                tm.assert_frame_equal(expected, getattr(gb, op)(*args))
                # individual columns
                for c in df:
                    if c not in ['float', 'int', 'float_missing'
                                 ] and op != 'shift':
                        pytest.raises(DataError, gb[c].transform, op)
                        pytest.raises(DataError, getattr(gb[c], op))
                    else:
                        expected = gb[c].apply(targop)
                        expected.name = c
                        tm.assert_series_equal(expected,
                                               gb[c].transform(op, *args))
                        tm.assert_series_equal(expected,
                                               getattr(gb[c], op)(*args))
コード例 #50
0
    def _plot_per_grating(self, df: pd.DataFrame):
        trace_number = 0  # Keep track of the number of traces created and added

        all_b_c_outliers = self.results['B-C'][self.outliers['B-C']]
        all_c_a_outliers = self.results['C-A'][self.outliers['C-A']]

        # Find matching stripe differences and outliers
        b_c = match_dfs(self.results['B-C'], df, 'ROOTNAME')
        c_a = match_dfs(self.results['C-A'], df, 'ROOTNAME')

        b_c_outliers = match_dfs(all_b_c_outliers, df, 'ROOTNAME') if not all_b_c_outliers.empty else None
        c_a_outliers = match_dfs(all_c_a_outliers, df, 'ROOTNAME') if not all_c_a_outliers.empty else None

        # Plot diffs v time
        if not b_c.empty:
            self.figure.add_trace(
                go.Scattergl(
                    x=b_c.lamp_time,
                    y=b_c.seg_diff,
                    name='NUVB - NUVC',
                    mode='markers',
                    text=b_c.hover_text,
                    visible=False,
                    marker=dict(color='#1f77b4')  # "muted blue"
                ),
                row=1,
                col=1
            )
            trace_number += 1

        if c_a is not None and not c_a.empty:
            self.figure.add_trace(
                go.Scattergl(
                    x=c_a.lamp_time,
                    y=c_a.seg_diff,
                    name='NUVC - NUVA',
                    mode='markers',
                    text=c_a.hover_text,
                    visible=False,
                    marker=dict(color='#1f77b4')
                ),
                row=2,
                col=1
            )
            trace_number += 1

        # Plot shift v time per grating group
        groups = df.groupby('OPT_ELEM')

        for i, (grating, group) in enumerate(groups):
            trace_number += 2

            abstime = absolute_time(df=group)
            group = group.set_index(abstime.to_datetime())
            group = group.sort_index()

            rolling_mean = group.rolling('180D').mean()

            self.figure.add_trace(
                go.Scattergl(
                    x=group.index,
                    y=group[self.shift],
                    name=grating,
                    mode='markers',
                    text=group.hover_text,
                    visible=False,
                    marker=dict(
                        cmax=len(df.OPT_ELEM.unique()) - 1,  # Individual plots need to be on the same scale
                        cmin=0,
                        color=list(repeat(i, len(group))),
                        colorscale='Viridis',
                        opacity=0.5
                    )
                ),
                row=3,
                col=1,
            )

            # Plot a rolling average of the shift value
            self.figure.add_trace(
                go.Scattergl(
                    x=rolling_mean.index,
                    y=rolling_mean[self.shift],
                    name=f'{grating} Rolling Mean',
                    mode='lines',
                    visible=False
                ),
                row=3,
                col=1
            )

        # Plot each set of potential outliers
        outlier_sets = [b_c_outliers, c_a_outliers]
        position = [(1, 1), (2, 1)]
        labels = ['B-C Outliers', 'C-A Outliers']
        for outliers, (row, col), label in zip(outlier_sets, position, labels):
            if outliers is not None and not outliers.empty:
                self.figure.add_trace(
                    go.Scattergl(
                        x=outliers.lamp_time,
                        y=outliers.seg_diff,
                        name=label,
                        mode='markers',
                        text=outliers.hover_text,
                        visible=False,
                        marker=dict(color='red'),
                    ),
                    row=row,
                    col=col
                )
                trace_number += 1

                # Plot outlier points in a different color
                outliers_main = match_dfs(df, outliers, 'ROOTNAME')
                outlier_groups = outliers_main.groupby('OPT_ELEM')
                for grating, group in outlier_groups:
                    trace_number += 1

                    lamp_time = absolute_time(df=group)

                    self.figure.add_trace(
                        go.Scattergl(
                            x=lamp_time.to_datetime(),
                            y=group[self.shift],
                            name=f'{grating} {label}',
                            mode='markers',
                            text=group.hover_text,
                            visible=False,
                            marker=dict(color='red'),
                            legendgroup=f'{grating} outliers'
                        ),
                        row=3,
                        col=1,
                    )

        return trace_number
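
The 180-day rolling mean above relies on a time-based window over a sorted DatetimeIndex. A minimal sketch:

import pandas as pd

idx = pd.date_range("2020-01-01", periods=400, freq="D")
s = pd.Series(range(400), index=idx)

# A '180D' window averages everything within the trailing 180 days of each
# timestamp, so it adapts to irregular sampling, unlike a fixed-count window.
rolling_mean = s.sort_index().rolling("180D").mean()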
コード例 #51
0
def test_sort_datetimelike():
    # GH10505

    # use same data as test_groupby_sort_categorical, which category is
    # corresponding to datetime.month
    df = DataFrame(
        {
            'dt': [
                datetime(2011, 7, 1),
                datetime(2011, 7, 1),
                datetime(2011, 2, 1),
                datetime(2011, 5, 1),
                datetime(2011, 2, 1),
                datetime(2011, 1, 1),
                datetime(2011, 5, 1)
            ],
            'foo': [10, 8, 5, 6, 4, 1, 7],
            'bar': [10, 20, 30, 40, 50, 60, 70]
        },
        columns=['dt', 'foo', 'bar'])

    # ordered=True
    df['dt'] = Categorical(df['dt'], ordered=True)
    index = [
        datetime(2011, 1, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 7, 1)
    ]
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)

    index = [
        datetime(2011, 7, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 1, 1)
    ]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index,
                                           categories=index,
                                           name='dt',
                                           ordered=True)

    col = 'dt'
    assert_frame_equal(result_sort,
                       df.groupby(col, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(result_sort,
                       df.groupby(col, sort=False, observed=False).first())

    # ordered = False
    df['dt'] = Categorical(df['dt'], ordered=False)
    index = [
        datetime(2011, 1, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 7, 1)
    ]
    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                            columns=['foo', 'bar'])
    result_sort.index = CategoricalIndex(index, name='dt')

    index = [
        datetime(2011, 7, 1),
        datetime(2011, 2, 1),
        datetime(2011, 5, 1),
        datetime(2011, 1, 1)
    ]
    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                              columns=['foo', 'bar'])
    result_nosort.index = CategoricalIndex(index, categories=index, name='dt')

    col = 'dt'
    assert_frame_equal(result_sort,
                       df.groupby(col, sort=True, observed=False).first())
    assert_frame_equal(result_nosort,
                       df.groupby(col, sort=False, observed=False).first())
コード例 #52
0
def test_groupby_get_by_index():
    # GH 33439
    df = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})
    res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])})
    expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]}).set_index("A")
    pd.testing.assert_frame_equal(res, expected)
コード例 #53
0
def test_basic():

    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"],
                       ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"],
                       ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"],
                       ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all), df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum), df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels,
                               categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels,
                           ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
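
Several of the assertions above hinge on observed=False, which keeps unobserved categories as (empty) groups. A compact sketch:

import pandas as pd

cats = pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])
df = pd.DataFrame({"key": cats, "val": [1, 2, 3]})

# 'c' never occurs in the data but still appears in the result,
# with sum() of an empty group being 0.
print(df.groupby("key", observed=False)["val"].sum())  # a: 3, b: 3, c: 0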
コード例 #54
0
def test_func_duplicates_raises():
    # GH28426
    msg = "Function names"
    df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").agg(["min", "min"])
コード例 #55
0
 digits_spectrals.append(
     SpectralClustering(n_clusters=digits_num_clusters,
                        eigen_solver='arpack',
                        affinity=digits_affin).fit_predict(digits_X))
 moons_spectrals.append(
     SpectralClustering(n_clusters=moons_num_clusters,
                        eigen_solver='arpack',
                        affinity=moons_affin).fit_predict(moons_X))
 print(
     f"*****************Metrics of Spectral Clustering****************************\n"
     f"Digits model #{i+1}: \n {digits_num_clusters} clusters, {digits_affin} affinity"
     f"\nMoons model #{i+1}: \n {moons_num_clusters} clusters, {moons_affin} affinity"
     f"\n{clustering_metrics(moons_spectrals[i], digits_spectrals[i])}")
 df = DataFrame(
     dict(x=moons_X[:, 0], y=moons_X[:, 1], label=moons_spectrals[i]))
 colors = {0: 'orange', 1: 'purple'}
 fig, ax = plt.subplots()
 grouped = df.groupby('label')
 for key, group in grouped:
     group.plot(
         ax=ax,
         kind='scatter',
         x='x',
         y='y',
         label=key,
         color=colors[key],
         title=f'Spectral Clustering on Moons, affinity = {moons_affin}')
 plt.show()
 skplt.metrics.plot_silhouette(
     digits_X,
     digits_spectrals[i],
     title=
     f'Spectral Clustering on Digits Silhouette Analysis, affinity = {digits_affin}')
コード例 #56
0
def test_agg_index_has_complex_internals(index):
    # GH 31223
    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
    result = df.groupby("group").agg({"value": Series.nunique})
    expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
    tm.assert_frame_equal(result, expected)
コード例 #57
0
ファイル: utils.py プロジェクト: wusuowei60/single-cell
def highly_variable_genes_single_batch_seurat(
    adata: sparse.spmatrix,  # log transformed, base e
    genes: pd.DataFrame,
    layer=None,
    min_disp=0.5,
    max_disp=np.inf,
    min_mean=0.0125,
    max_mean=3,
    n_top_genes: int = 0,
    n_bins=20,
    flavor='seurat'
) -> None:
    X = adata.layers[layer] if layer is not None else adata  # .X
    
    if flavor == 'seurat':
        # if the log transform was not base e, convert it to base e first
        X = np.expm1(X)
        # then undo the log transform (back to the linear scale)
        
    mean, var = my_get_mean_var(X, axis='gene')
    mean[mean == 0] = 1e-12
    dispersion = var / mean
    if flavor == 'seurat':
        dispersion[dispersion == 0] = np.nan
        dispersion = np.log(dispersion)
        mean = np.log1p(mean)

    genes['dispersions'] = dispersion
    genes['means'] = mean
    genes['vars'] = var
    
    if flavor == 'seurat':
        genes['mean_bin'] = pd.cut(genes.means, bins=n_bins)
        disp_grouped = genes.groupby('mean_bin')['dispersions']
        
        single_bin_gene = []

        def find_nan_interval(x):
            if len(x) == 1:
                # A single gene in a bin cannot be z-scored against its peers;
                # setting std = x.mean() and mean = 0 forces (x - mean) / std == 1.
                single_bin_gene.extend(x.index)
                std, mean = x.mean(), 0
            else:
                mean = x.mean()
                std = x.std(ddof=1)
            return (x - mean) / std
        
        genes['dispersions_norm'] = disp_grouped.transform(lambda x: find_nan_interval(x))
        if len(single_bin_gene) > 0:
            print(
                f'Gene indices {single_bin_gene} fell into a single bin: their '
                'normalized dispersion was set to 1.',
                '    Decreasing `n_bins` will likely avoid this effect.'
            )
    
    if n_top_genes > adata.shape[1]:
        print('`n_top_genes` > `adata.n_var`, returning all genes.')
        genes['highly_variable'] = np.ones(adata.shape[1], dtype=bool)
    elif n_top_genes > 0:
        genes_largest = genes.nlargest(n_top_genes, 'dispersions_norm')
        disp_cut_off = genes_largest.dispersions_norm.iloc[-1]
        genes['highly_variable'] = np.zeros(adata.shape[1], dtype=bool)
        genes.loc[genes_largest.index, 'highly_variable'] = True
        print(
            f'the {n_top_genes} top genes correspond to a '
            f'normalized dispersion cutoff of {disp_cut_off}'
        )
    else:
        dispersion_norm = genes.dispersions_norm.values.astype('float32')
        dispersion_norm = np.nan_to_num(dispersion_norm)  # similar to Seurat
        gene_subset = np.logical_and.reduce(
            (
                mean > min_mean,
                mean < max_mean,
                dispersion_norm > min_disp,
                dispersion_norm < max_disp,
            )
        )
        genes['highly_variable'] = gene_subset

    sns.scatterplot(data=genes, x="means", y="dispersions", hue="highly_variable", s=7, alpha=0.5)
    plt.savefig('6.jpg')
    plt.cla()

    sns.scatterplot(data=genes, x="means", y="dispersions_norm", hue="highly_variable", s=7, alpha=0.5)
    plt.savefig('7.jpg')
    plt.cla()

    return None
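
The Seurat-flavor normalization above is a z-score of each gene's dispersion within its mean bin. A compact pandas-only sketch of that step (synthetic data; column names follow the snippet):

import numpy as np
import pandas as pd

np.random.seed(0)
genes = pd.DataFrame({"means": np.random.rand(100),
                      "dispersions": np.random.rand(100)})
genes["mean_bin"] = pd.cut(genes["means"], bins=20)

# Z-score each dispersion against the other genes in the same mean bin;
# a bin holding a single gene yields NaN here (std with ddof=1), which is
# exactly the case the snippet above special-cases.
genes["dispersions_norm"] = (
    genes.groupby("mean_bin")["dispersions"]
         .transform(lambda x: (x - x.mean()) / x.std(ddof=1)))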
コード例 #58
0
 def test_missing_raises(self):
     df = DataFrame({"A": [0, 1], "B": [1, 2]})
     with pytest.raises(KeyError, match="Column 'C' does not exist"):
         df.groupby("A").agg(c=("C", "sum"))
コード例 #59
0
    def test_to_latex_multiindex(self):
        df = DataFrame({('x', 'y'): ['a']})
        result = df.to_latex()
        expected = r"""\begin{tabular}{ll}
\toprule
{} &  x \\
{} &  y \\
\midrule
0 &  a \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        result = df.T.to_latex()
        expected = r"""\begin{tabular}{lll}
\toprule
  &   &  0 \\
\midrule
x & y &  a \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        df = DataFrame.from_dict({
            ('c1', 0):
            pd.Series({x: x
                       for x in range(4)}),
            ('c1', 1):
            pd.Series({x: x + 4
                       for x in range(4)}),
            ('c2', 0):
            pd.Series({x: x
                       for x in range(4)}),
            ('c2', 1):
            pd.Series({x: x + 4
                       for x in range(4)}),
            ('c3', 0):
            pd.Series({x: x
                       for x in range(4)}),
        }).T
        result = df.to_latex()
        expected = r"""\begin{tabular}{llrrrr}
\toprule
   &   &  0 &  1 &  2 &  3 \\
\midrule
c1 & 0 &  0 &  1 &  2 &  3 \\
   & 1 &  4 &  5 &  6 &  7 \\
c2 & 0 &  0 &  1 &  2 &  3 \\
   & 1 &  4 &  5 &  6 &  7 \\
c3 & 0 &  0 &  1 &  2 &  3 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        # GH 14184
        df = df.T
        df.columns.names = ['a', 'b']
        result = df.to_latex()
        expected = r"""\begin{tabular}{lrrrrr}
\toprule
a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
b &  0 &  1 &  0 &  1 &  0 \\
\midrule
0 &  0 &  4 &  0 &  4 &  0 \\
1 &  1 &  5 &  1 &  5 &  1 \\
2 &  2 &  6 &  2 &  6 &  2 \\
3 &  3 &  7 &  3 &  7 &  3 \\
\bottomrule
\end{tabular}
"""
        assert result == expected

        # GH 10660
        df = pd.DataFrame({
            'a': [0, 0, 1, 1],
            'b': list('abab'),
            'c': [1, 2, 3, 4]
        })
        result = df.set_index(['a', 'b']).to_latex()
        expected = r"""\begin{tabular}{llr}
\toprule
  &   &  c \\
a & b &    \\
\midrule
0 & a &  1 \\
  & b &  2 \\
1 & a &  3 \\
  & b &  4 \\
\bottomrule
\end{tabular}
"""

        assert result == expected

        result = df.groupby('a').describe().to_latex()
        expected = r"""\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{8}{l}{c} \\
{} & count & mean &       std &  min &   25\% &  50\% &   75\% &  max \\
a &       &      &           &      &       &      &       &      \\
\midrule
0 &   2.0 &  1.5 &  0.707107 &  1.0 &  1.25 &  1.5 &  1.75 &  2.0 \\
1 &   2.0 &  3.5 &  0.707107 &  3.0 &  3.25 &  3.5 &  3.75 &  4.0 \\
\bottomrule
\end{tabular}
"""

        assert result == expected
コード例 #60
0
def test_agg_misc():
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    df_col = df.reset_index()
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=["index", "date"])

    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    # passed lambda
    for t in cases:
        result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
        rcustom = t["B"].apply(lambda x: np.std(x, ddof=1))
        expected = pd.concat([r["A"].sum(), rcustom], axis=1)
        tm.assert_frame_equal(result, expected, check_like=True)

    # agg with renamers
    expected = pd.concat(
        [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("result1", "A"),
                                                  ("result1", "B"),
                                                  ("result2", "A"),
                                                  ("result2", "B")])

    msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})

    # agg with different hows
    expected = pd.concat(
        [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std"),
                                                  ("B", "mean"), ("B", "std")])
    for t in cases:
        result = t.agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for t in cases:
        result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    msg = "nested renamer is not supported"

    # series like aggs
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t["A"].agg({"A": ["sum", "std"]})

        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]})

    # errors
    # invalid names in the agg specification
    msg = "\"Column 'B' does not exist!\""
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})