Example #1
    def test_groupby_categorical(self):
        levels = ["foo", "bar", "baz", "qux"]
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))

        result = data.groupby(cats).mean()

        expected = data.groupby(np.asarray(cats)).mean()
        exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True)
        expected = expected.reindex(exp_idx)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        idx = cats.codes.argsort()
        ord_labels = np.asarray(cats).take(idx)
        ord_data = data.take(idx)

        exp_cats = Categorical(ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"])
        expected = ord_data.groupby(exp_cats, sort=False).describe()
        expected.index.names = [None, None]
        assert_frame_equal(desc_result, expected)

        # GH 10460
        expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
        exp = CategoricalIndex(expc)
        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
        exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
Example #2
    def test_filter_using_len(self):
        # BUG GH4447
        df = DataFrame({'A': np.arange(8),
                        'B': list('aabbbbcc'),
                        'C': np.arange(8)})
        grouped = df.groupby('B')
        actual = grouped.filter(lambda x: len(x) > 2)
        expected = DataFrame(
            {'A': np.arange(2, 6),
             'B': list('bbbb'),
             'C': np.arange(2, 6)}, index=np.arange(2, 6))
        assert_frame_equal(actual, expected)

        actual = grouped.filter(lambda x: len(x) > 4)
        expected = df.ix[[]]
        assert_frame_equal(actual, expected)

        # Series have always worked properly, but we'll test anyway.
        s = df['B']
        grouped = s.groupby(s)
        actual = grouped.filter(lambda x: len(x) > 2)
        expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
        assert_series_equal(actual, expected)

        actual = grouped.filter(lambda x: len(x) > 4)
        expected = s[[]]
        assert_series_equal(actual, expected)
Example #3
    def test_fromRecords_toRecords(self):
        # structured array
        K = 10

        recs = np.zeros(K, dtype="O,O,f8,f8")
        recs["f0"] = range(K / 2) * 2
        recs["f1"] = np.arange(K) / (K / 2)
        recs["f2"] = np.arange(K) * 2
        recs["f3"] = np.arange(K)

        lp = LongPanel.fromRecords(recs, "f0", "f1")
        self.assertEqual(len(lp.items), 2)

        lp = LongPanel.fromRecords(recs, "f0", "f1", exclude=["f2"])
        self.assertEqual(len(lp.items), 1)

        torecs = lp.toRecords()
        self.assertEqual(len(torecs.dtype.names), len(lp.items) + 2)

        # DataFrame
        df = DataFrame.from_records(recs)
        lp = LongPanel.fromRecords(df, "f0", "f1", exclude=["f2"])
        self.assertEqual(len(lp.items), 1)

        # dict of arrays
        series = DataFrame.from_records(recs)._series
        lp = LongPanel.fromRecords(series, "f0", "f1", exclude=["f2"])
        self.assertEqual(len(lp.items), 1)
        self.assert_("f2" in series)

        self.assertRaises(Exception, LongPanel.fromRecords, np.zeros((3, 3)), 0, 1)
Example #4
    def test_apply_categorical_data(self):
        # GH 10138
        for ordered in [True, False]:
            dense = Categorical(list('abc'), ordered=ordered)
            # 'b' is in the categories but not in the list
            missing = Categorical(
                list('aaa'), categories=['a', 'b'], ordered=ordered)
            values = np.arange(len(dense))
            df = DataFrame({'missing': missing,
                            'dense': dense,
                            'values': values})
            grouped = df.groupby(['missing', 'dense'])

            # missing category 'b' should still exist in the output index
            idx = MultiIndex.from_product(
                [Categorical(['a', 'b'], ordered=ordered),
                 Categorical(['a', 'b', 'c'], ordered=ordered)],
                names=['missing', 'dense'])
            expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                                 index=idx,
                                 columns=['values'])

            assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
            assert_frame_equal(grouped.mean(), expected)
            assert_frame_equal(grouped.agg(np.mean), expected)

            # but for transform we should still get back the original index
            idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
                                          names=['missing', 'dense'])
            expected = Series(1, index=idx)
            assert_series_equal(grouped.apply(lambda x: 1), expected)
Example #5
    def test_groupby_categorical_index(self):

        levels = ['foo', 'bar', 'baz', 'qux']
        codes = np.random.randint(0, 4, size=20)
        cats = Categorical.from_codes(codes, levels, ordered=True)
        df = DataFrame(
            np.repeat(
                np.arange(20), 4).reshape(-1, 4), columns=list('abcd'))
        df['cats'] = cats

        # with a cat index
        result = df.set_index('cats').groupby(level=0).sum()
        expected = df[list('abcd')].groupby(cats.codes).sum()
        expected.index = CategoricalIndex(
            Categorical.from_codes(
                [0, 1, 2, 3], levels, ordered=True), name='cats')
        assert_frame_equal(result, expected)

        # with a cat column, should produce a cat index
        result = df.groupby('cats').sum()
        expected = df[list('abcd')].groupby(cats.codes).sum()
        expected.index = CategoricalIndex(
            Categorical.from_codes(
                [0, 1, 2, 3], levels, ordered=True), name='cats')
        assert_frame_equal(result, expected)
Example #6
    def test_multi_assign(self):

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({'FC':['a','b','a','b','a','b'],
                        'PF':[0,0,0,0,1,1],
                        'col1':lrange(6),
                        'col2':lrange(6,12)})
        df.ix[1,0]=np.nan
        df2 = df.copy()

        mask=~df2.FC.isnull()
        cols=['col1', 'col2']

        dft = df2 * 2
        dft.ix[3,3] = np.nan

        expected = DataFrame({'FC':['a',np.nan,'a','b','a','b'],
                              'PF':[0,0,0,0,1,1],
                              'col1':Series([0,1,4,6,8,10]),
                              'col2':[12,7,16,np.nan,20,22]})


        # frame on rhs
        df2.ix[mask, cols]= dft.ix[mask, cols]
        assert_frame_equal(df2,expected)

        df2.ix[mask, cols]= dft.ix[mask, cols]
        assert_frame_equal(df2,expected)

        # with an ndarray on rhs
        df2 = df.copy()
        df2.ix[mask, cols]= dft.ix[mask, cols].values
        assert_frame_equal(df2,expected)
        df2.ix[mask, cols]= dft.ix[mask, cols].values
        assert_frame_equal(df2,expected)
Example #7
    def test_filter_multiple_timestamp(self):
        # GH 10114
        df = DataFrame({'A': np.arange(5, dtype='int64'),
                        'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
                        'C': Timestamp('20130101')})

        grouped = df.groupby(['B', 'C'])

        result = grouped['A'].filter(lambda x: True)
        assert_series_equal(df['A'], result)

        result = grouped['A'].transform(len)
        expected = Series([2, 3, 2, 3, 3], name='A')
        assert_series_equal(result, expected)

        result = grouped.filter(lambda x: True)
        assert_frame_equal(df, result)

        result = grouped.transform('sum')
        expected = DataFrame({'A': [2, 8, 2, 8, 8]})
        assert_frame_equal(result, expected)

        result = grouped.transform(len)
        expected = DataFrame({'A': [2, 3, 2, 3, 3]})
        assert_frame_equal(result, expected)
Example #8
    def test_fromRecords_toRecords(self):
        # structured array
        K = 10

        recs = np.zeros(K, dtype='O,O,f8,f8')
        recs['f0'] = list(range(K // 2)) * 2
        recs['f1'] = np.arange(K) // (K // 2)
        recs['f2'] = np.arange(K) * 2
        recs['f3'] = np.arange(K)

        lp = LongPanel.fromRecords(recs, 'f0', 'f1')
        self.assertEqual(len(lp.items), 2)

        lp = LongPanel.fromRecords(recs, 'f0', 'f1', exclude=['f2'])
        self.assertEqual(len(lp.items), 1)

        torecs = lp.toRecords()
        self.assertEqual(len(torecs.dtype.names), len(lp.items) + 2)

        # DataFrame
        df = DataFrame.fromRecords(recs)
        lp = LongPanel.fromRecords(df, 'f0', 'f1', exclude=['f2'])
        self.assertEqual(len(lp.items), 1)

        # dict of arrays
        series = DataFrame.fromRecords(recs)._series
        lp = LongPanel.fromRecords(series, 'f0', 'f1', exclude=['f2'])
        self.assertEqual(len(lp.items), 1)
        self.assert_('f2' in series)

        self.assertRaises(Exception, LongPanel.fromRecords, np.zeros((3, 3)),
                          0, 1)
Example #9
    def test_agg_nested_dicts(self):

        # API change for disallowing these types of nested dicts
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        def f():
            g.aggregate({'r1': {'C': ['mean', 'sum']},
                         'r2': {'D': ['mean', 'sum']}})

        self.assertRaises(SpecificationError, f)

        result = g.agg({'C': {'ra': ['mean', 'std']},
                        'D': {'rb': ['mean', 'std']}})
        expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(),
                              g['D'].std()], axis=1)
        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
        assert_frame_equal(result, expected, check_like=True)

        # same name as the original column
        # GH9052
        expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
        expected = expected.rename(columns={'result1': 'D'})
        result = g['D'].agg({'D': np.sum, 'result2': np.mean})
        assert_frame_equal(result, expected, check_like=True)
Example #10
    def test_agg_compat(self):

        # GH 12334

        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                        'C': np.random.randn(8) + 1.0,
                        'D': np.arange(8)})

        g = df.groupby(['A', 'B'])

        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = MultiIndex.from_tuples([('C', 'sum'),
                                                   ('C', 'std')])
        result = g['D'].agg({'C': ['sum', 'std']})
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat([g['D'].sum(),
                              g['D'].std()],
                             axis=1)
        expected.columns = ['C', 'D']
        result = g['D'].agg({'C': 'sum', 'D': 'std'})
        assert_frame_equal(result, expected, check_like=True)
Example #11
    def test_agg_dict_parameter_cast_result_dtypes(self):
        # GH 12821

        df = DataFrame(
            {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
             'time': date_range('1/1/2011', periods=8, freq='H')})
        df.loc[[0, 1, 2, 5], 'time'] = None

        # test for `first` function
        exp = df.loc[[0, 3, 4, 6]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.first(), exp)
        assert_frame_equal(grouped.agg('first'), exp)
        assert_frame_equal(grouped.agg({'time': 'first'}), exp)
        assert_series_equal(grouped.time.first(), exp['time'])
        assert_series_equal(grouped.time.agg('first'), exp['time'])

        # test for `last` function
        exp = df.loc[[0, 3, 4, 7]].set_index('class')
        grouped = df.groupby('class')
        assert_frame_equal(grouped.last(), exp)
        assert_frame_equal(grouped.agg('last'), exp)
        assert_frame_equal(grouped.agg({'time': 'last'}), exp)
        assert_series_equal(grouped.time.last(), exp['time'])
        assert_series_equal(grouped.time.agg('last'), exp['time'])
Example #12
    def test_apply_categorical_data(self):
        # GH 10138
        for ordered in [True, False]:
            dense = Categorical(list("abc"), ordered=ordered)
            # 'b' is in the categories but not in the list
            missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
            values = np.arange(len(dense))
            df = DataFrame({"missing": missing, "dense": dense, "values": values})
            grouped = df.groupby(["missing", "dense"])

            # missing category 'b' should still exist in the output index
            idx = MultiIndex.from_product(
                [Categorical(["a", "b"], ordered=ordered), Categorical(["a", "b", "c"], ordered=ordered)],
                names=["missing", "dense"],
            )
            expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], index=idx, columns=["values"])

            assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
            assert_frame_equal(grouped.mean(), expected)
            assert_frame_equal(grouped.agg(np.mean), expected)

            # but for transform we should still get back the original index
            idx = MultiIndex.from_product([["a"], ["a", "b", "c"]], names=["missing", "dense"])
            expected = Series(1, index=idx)
            assert_series_equal(grouped.apply(lambda x: 1), expected)
Example #13
    def test_groupby_datetime_categorical(self):
        # GH9049: ensure backward compatibility
        levels = pd.date_range("2014-01-01", periods=4)
        codes = np.random.randint(0, 4, size=100)

        cats = Categorical.from_codes(codes, levels, ordered=True)

        data = DataFrame(np.random.randn(100, 4))
        result = data.groupby(cats).mean()

        expected = data.groupby(np.asarray(cats)).mean()
        expected = expected.reindex(levels)
        expected.index = CategoricalIndex(expected.index, categories=expected.index, ordered=True)

        assert_frame_equal(result, expected)

        grouped = data.groupby(cats)
        desc_result = grouped.describe()

        idx = cats.codes.argsort()
        ord_labels = cats.take_nd(idx)
        ord_data = data.take(idx)
        expected = ord_data.groupby(ord_labels).describe()
        expected.index.names = [None, None]
        assert_frame_equal(desc_result, expected)
        tm.assert_index_equal(desc_result.index, expected.index)
        tm.assert_index_equal(desc_result.index.get_level_values(0), expected.index.get_level_values(0))

        # GH 10460
        expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
        exp = CategoricalIndex(expc)
        self.assert_index_equal(desc_result.index.get_level_values(0), exp)
        exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
        self.assert_index_equal(desc_result.index.get_level_values(1), exp)
Example #14
    def test_groupby_describe_categorical_columns(self):
        # GH 11558
        cats = pd.CategoricalIndex(["qux", "foo", "baz", "bar"], categories=["foo", "bar", "baz", "qux"], ordered=True)
        df = DataFrame(np.random.randn(20, 4), columns=cats)
        result = df.groupby([1, 2, 3, 4] * 5).describe()

        tm.assert_index_equal(result.columns, cats)
        tm.assert_categorical_equal(result.columns.values, cats.values)
Example #15
    def test_multi_nan_indexing(self):

        # GH 3588
        df = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]})
        result = df.set_index(['a','b'], drop=False)
        expected = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]},
                             index = [Index(['R1','R2',np.nan,'R4'],name='a'),Index(['C1','C2','C3','C4'],name='b')])
        assert_frame_equal(result,expected)
Example #16
    def test_ix_weird_slicing(self):
        ## http://stackoverflow.com/q/17056560/1240268
        df = DataFrame({'one' : [1, 2, 3, np.nan, np.nan], 'two' : [1, 2, 3, 4, 5]})
        df.ix[df['one']>1, 'two'] = -df['two']

        expected = DataFrame({'one': {0: 1.0, 1: 2.0, 2: 3.0, 3: nan, 4: nan},
                              'two': {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}})
        assert_frame_equal(df, expected)
Example #17
    def test_xs_multiindex(self):

        # GH2903
        columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'hello'), ('b', 'world')], names=['lvl0', 'lvl1'])
        df = DataFrame(np.random.randn(4, 4), columns=columns)
        df.sortlevel(axis=1,inplace=True)
        result = df.xs('a', level='lvl0', axis=1)
        expected = df.iloc[:,0:2].loc[:,'a']
        assert_frame_equal(result,expected)
Example #18
    def test_var_on_multiplegroups(self):
        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'data3': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a'],
                        'key2': ['one', 'two', 'one', 'two', 'one']})
        ddf = self.psc.from_data_frame(df)
        dgrouped = ddf.groupby(['key1', 'key2'])
        grouped = df.groupby(['key1', 'key2'])
        assert_frame_equal(dgrouped.var().collect(), grouped.var())
Example #19
    def test_setitem_dtype_upcast(self):
 
        # GH3216
        df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        df['c'] = np.nan
        self.assert_(df['c'].dtype == np.float64)

        df.ix[0,'c'] = 'foo'
        expected = DataFrame([{"a": 1, "c" : 'foo'}, {"a": 3, "b": 2, "c" : np.nan}])
        assert_frame_equal(df,expected)
Example #20
    def test_var_on_multiplegroups(self):
        pd_df = DataFrame({'data1': np.random.randn(5),
                           'data2': np.random.randn(5),
                           'data3': np.random.randn(5),
                           'key1': ['a', 'a', 'b', 'b', 'a'],
                           'key2': ['one', 'two', 'one', 'two', 'one']})
        sp_df = self.psc.from_pd_data_frame(pd_df)
        actual_grouped = sp_df.groupby(['key1', 'key2'])
        expected_grouped = pd_df.groupby(['key1', 'key2'])
        assert_frame_equal(actual_grouped.var().collect(),
                           expected_grouped.var())
Example #21
    def test_agg_item_by_item_raise_typeerror(self):
        from numpy.random import randint

        df = DataFrame(randint(10, size=(20, 10)))

        def raiseException(df):
            pprint_thing('----------------------------------------')
            pprint_thing(df.to_string())
            raise TypeError

        self.assertRaises(TypeError, df.groupby(0).agg, raiseException)
Example #22
    def test_filter_nan_is_false(self):
        df = DataFrame({'A': np.arange(8),
                        'B': list('aabbbbcc'),
                        'C': np.arange(8)})
        s = df['B']
        g_df = df.groupby(df['B'])
        g_s = s.groupby(s)

        f = lambda x: np.nan
        assert_frame_equal(g_df.filter(f), df.loc[[]])
        assert_series_equal(g_s.filter(f), s[[]])
Example #23
    def test_iloc_setitem_series(self):
        """ originally from test_series.py """
        df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), columns=list('ABCD'))

        df.iloc[1,1] = 1
        result = df.iloc[1,1]
        self.assert_(result == 1)

        df.iloc[:,2:3] = 0
        expected = df.iloc[:,2:3]
        result = df.iloc[:,2:3]
        assert_frame_equal(result, expected)
Example #24
    def test_indexing_mixed_frame_bug(self):

        # GH3492
        df = DataFrame({"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}})

        # this works, new column is created correctly
        df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x)

        # this does not work, i.e. column test is not changed
        idx = df["test"] == "_"
        temp = df.ix[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x)
        df.ix[idx, "test"] = temp
        self.assert_(df.iloc[0, 2] == "-----")
Example #25
    def test_indexing_mixed_frame_bug(self):

        # GH3492
        df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'},
                        'b': {1: 111, 2: 222, 3: 333}})

        # this works, new column is created correctly
        df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)

        # this does not work, i.e. column test is not changed
        idx = df['test'] == '_'
        temp = df.ix[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)
        df.ix[idx, 'test'] = temp
        self.assert_(df.iloc[0, 2] == '-----')
Example #26
    def predict(self, beta=None, x=None, fill_value=None,
                fill_method=None, axis=0):
        """
        Parameters
        ----------
        beta : Series
        x : Series or DataFrame
        fill_value : scalar or dict, default None
        fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        axis : {0, 1}, default 0
            See DataFrame.fillna for more details

        Notes
        -----
        1. If both fill_value and fill_method are None then NaNs are dropped
        (this is the default behavior)
        2. An intercept will be automatically added to the new_y_values if
           the model was fitted using an intercept

        Returns
        -------
        Series of predicted values
        """
        if beta is None and x is None:
            return self.y_predict

        if beta is None:
            beta = self.beta
        else:
            beta = beta.reindex(self.beta.index)
            if isnull(beta).any():
                raise ValueError('Must supply betas for same variables')

        if x is None:
            x = self._x
            orig_x = x
        else:
            orig_x = x
            if fill_value is None and fill_method is None:
                x = x.dropna(how='any')
            else:
                x = x.fillna(value=fill_value, method=fill_method, axis=axis)
            if isinstance(x, Series):
                x = DataFrame({'x': x})
            if self._intercept:
                x['intercept'] = 1.

            x = x.reindex(columns=self._x.columns)

        rs = np.dot(x.values, beta.values)
        return Series(rs, x.index).reindex(orig_x.index)
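
For reference, a minimal standalone sketch of the fill behavior described in the Notes above (the frame and fill value here are illustrative, not tied to a fitted model):

import numpy as np
from pandas import DataFrame

x = DataFrame({'x1': [1.0, np.nan, 3.0], 'x2': [4.0, 5.0, np.nan]})

# default path: rows containing any NaN are dropped before predicting
dropped = x.dropna(how='any')       # keeps only the first row

# with a fill_value, NaNs are filled instead of dropped
filled = x.fillna(value=0.0)        # all three rows survive
print(dropped.shape, filled.shape)  # (1, 2) (3, 2)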
Example #27
    def test_filter_against_workaround(self):
        np.random.seed(0)
        # Series of ints
        s = Series(np.random.randint(0, 100, 1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10

        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Series of floats
        s = 100 * Series(np.random.random(1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10
        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Set up DataFrame of ints, floats, strings.
        from string import ascii_lowercase
        letters = np.array(list(ascii_lowercase))
        N = 1000
        random_letters = letters.take(np.random.randint(0, 26, N))
        df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                        'floats': N / 10 * Series(np.random.random(N)),
                        'letters': Series(random_letters)})

        # Group by ints; filter on floats.
        grouped = df.groupby('ints')
        old_way = df[grouped.floats.
                     transform(lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)

        # Group by floats (rounded); filter on strings.
        grouper = df.floats.apply(lambda x: np.round(x, -1))
        grouped = df.groupby(grouper)
        old_way = df[grouped.letters.
                     transform(lambda x: len(x) < N / 10).astype('bool')]
        new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
        assert_frame_equal(new_way, old_way)

        # Group by strings; filter on ints.
        grouped = df.groupby('letters')
        old_way = df[grouped.ints.
                     transform(lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)
Example #28
    def test_iloc_multiindex(self):
        df = DataFrame(np.random.randn(3, 3), columns=[[2, 2, 4], [6, 8, 10]], index=[[4, 4, 8], [8, 10, 12]])

        rs = df.iloc[2]
        xp = df.irow(2)
        assert_series_equal(rs, xp)

        rs = df.iloc[:, 2]
        xp = df.icol(2)
        assert_series_equal(rs, xp)

        rs = df.iloc[2, 2]
        xp = df.values[2, 2]
        self.assert_(rs == xp)
Example #29
    def test_agg_period_index(self):
        from pandas import period_range, PeriodIndex
        prng = period_range('2012-1-1', freq='M', periods=3)
        df = DataFrame(np.random.randn(3, 2), index=prng)
        rs = df.groupby(level=0).sum()
        tm.assertIsInstance(rs.index, PeriodIndex)

        # GH 3579
        index = period_range(start='1999-01', periods=5, freq='M')
        s1 = Series(np.random.rand(len(index)), index=index)
        s2 = Series(np.random.rand(len(index)), index=index)
        series = [('s1', s1), ('s2', s2)]
        df = DataFrame.from_items(series)
        grouped = df.groupby(df.index.month)
        list(grouped)
Example #30
    def test_astype_assignment_with_iloc(self):

        # GH4312
        df_orig = DataFrame([['1','2','3','.4',5,6.,'foo']],columns=list('ABCDEFG'))

        df = df_orig.copy()
        df.iloc[:,0:3] = df.iloc[:,0:3].astype(int)
        result = df.get_dtype_counts().sort_index()
        expected = Series({ 'int64' : 4, 'float64' : 1, 'object' : 2 }).sort_index()
        assert_series_equal(result,expected)

        df = df_orig.copy()
        df.iloc[:,0:3] = df.iloc[:,0:3].convert_objects(convert_numeric=True)
        result = df.get_dtype_counts().sort_index()
        expected = Series({ 'int64' : 4, 'float64' : 1, 'object' : 2 }).sort_index()
        assert_series_equal(result,expected)
Example #31
def percentileRank(frame, column=None, kind='mean'):
    """
    Return score at percentile for each point in time (cross-section)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score.  In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function.  A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used in
                  testing.  See

                  http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    TimeSeries or DataFrame, depending on input
    """
    from pandas.compat.scipy import percentileofscore
    fun = lambda xs, score: percentileofscore(remove_na(xs), score, kind=kind)

    results = {}
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            for date, xs in frame.T.items():
                results[date] = fun(xs, column.get(date, NaN))
        else:
            for date, xs in frame.T.items():
                results[date] = fun(xs, xs[column])
        results = Series(results)
    else:
        for column in frame.columns:
            for date, xs in framet.items():
                results.setdefault(date, {})[column] = fun(xs, xs[column])
        results = DataFrame(results).T
    return results
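
A usage sketch of the per-date logic above, assuming scipy is available (scipy.stats.percentileofscore accepts the same kind= choices documented here):

import numpy as np
from pandas import DataFrame
from scipy.stats import percentileofscore

frame = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])

# percentile rank of column 'a' within each cross-section (row)
ranks = {date: percentileofscore(xs.dropna(), xs['a'], kind='mean')
         for date, xs in frame.T.items()}
print(ranks)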
Example #32
def _parse_data(schema, rows):
    # TODO(db) Is dtype_map important? Was previously used to build a numpy array that built the pandas df
    # # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
    # dtype_map = {'INTEGER': np.dtype(float),
    #              'FLOAT': np.dtype(float),
    #              # This seems to be buggy without nanosecond indicator
    #              'TIMESTAMP': 'M8[ns]'}
    # col_dtypes = [dtype_map.get(field['type'], object) for field in schema['fields']]
    return DataFrame(
        OrderedDict([(field['name'], _parse_entry(field, row_cell))
                     for field, row_cell in zip(schema['fields'],
                                                row.get('f', []))])
        for row in rows)
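
A sketch of the call shape this expects, with a hypothetical stand-in for _parse_entry (the real helper's parsing is type-dependent and not shown in the snippet above):

from collections import OrderedDict
from pandas import DataFrame

# hypothetical stand-in for _parse_entry; real parsing depends on field type
def _parse_entry(field, cell):
    return cell.get('v')

schema = {'fields': [{'name': 'id', 'type': 'INTEGER'},
                     {'name': 'name', 'type': 'STRING'}]}
rows = [{'f': [{'v': '1'}, {'v': 'alice'}]},
        {'f': [{'v': '2'}, {'v': 'bob'}]}]

frame = DataFrame(
    OrderedDict([(field['name'], _parse_entry(field, cell))
                 for field, cell in zip(schema['fields'], row.get('f', []))])
    for row in rows)
print(frame)  # two rows, columns 'id' and 'name'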
Example #33
def _take_new_index(obj, indexer, new_index, axis=0):
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        new_values = algos.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        return DataFrame(obj._data.reindex_indexer(
            new_axis=new_index, indexer=indexer, axis=1))
    else:
        raise ValueError("'obj' should be either a Series or a DataFrame")
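
A small sketch of what the Series branch does, using only public API; note the internal take_1d additionally treats -1 positions in the indexer as missing, which this toy indexer avoids:

import numpy as np
from pandas import Series

s = Series([10.0, 20.0, 30.0], index=['a', 'b', 'c'], name='vals')
indexer = np.array([2, 0, 1])
new_index = ['x', 'y', 'z']

# take values positionally, then attach the new index
result = Series(s.to_numpy().take(indexer), index=new_index, name=s.name)
print(result)  # x=30.0, y=10.0, z=20.0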
Example #34
    def test_filter_and_transform_with_non_unique_string_index(self):
        # GH4620
        index = list('bbbcbbab')
        df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                        'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
        grouped_df = df.groupby('tag')
        ser = df['pid']
        grouped_ser = ser.groupby(df['tag'])
        expected_indexes = [1, 2, 4, 7]

        # Filter DataFrame
        actual = grouped_df.filter(lambda x: len(x) > 1)
        expected = df.iloc[expected_indexes]
        assert_frame_equal(actual, expected)

        actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
        expected = df.copy()
        expected.iloc[[0, 3, 5, 6]] = np.nan
        assert_frame_equal(actual, expected)

        # Filter Series
        actual = grouped_ser.filter(lambda x: len(x) > 1)
        expected = ser.take(expected_indexes)
        assert_series_equal(actual, expected)

        actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
        NA = np.nan
        expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
        # ^ made manually because this can get confusing!
        assert_series_equal(actual, expected)

        # Transform Series
        actual = grouped_ser.transform(len)
        expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
        assert_series_equal(actual, expected)

        # Transform (a column from) DataFrameGroupBy
        actual = grouped_df.pid.transform(len)
        assert_series_equal(actual, expected)
Example #35
def _take_new_index(obj, indexer, new_index, axis=0):
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        return DataFrame(obj._data.take(indexer, new_index=new_index, axis=1))
    else:
        raise NotImplementedError
Example #36
    def test_agg_api(self):
        # Note: needs a very recent version of pandas to pass
        # TODO(holden): Pass this test if local fails
        # GH 6337
        # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
        # different api for agg when passed custom function with mixed frame

        df = DataFrame({'data1': np.random.randn(5),
                        'data2': np.random.randn(5),
                        'key1': ['a', 'a', 'b', 'b', 'a'],
                        'key2': ['one', 'two', 'one', 'two', 'one']})
        ddf = self.psc.from_data_frame(df)
        dgrouped = ddf.groupby('key1')
        grouped = df.groupby('key1')

        def peak_to_peak(arr):
            return arr.max() - arr.min()

        expected = grouped.agg([peak_to_peak])
        expected.columns = ['data1', 'data2']
        result = dgrouped.agg(peak_to_peak).collect()
        assert_frame_equal(result, expected)
Example #37
    def createData2(self):
        y_data = [[1, np.NaN],
                  [2, 3],
                  [4, 5]]
        y_index = [datetime(2000, 1, 1),
                   datetime(2000, 1, 2),
                   datetime(2000, 1, 3)]
        y_cols = ['A', 'B']
        self.panel_y2 = DataFrame(np.array(y_data), index=y_index,
                                   columns=y_cols)

        x1_data = [[6, np.NaN],
                   [7, 8],
                   [9, 30],
                   [11, 12]]
        x1_index = [datetime(2000, 1, 1),
                    datetime(2000, 1, 2),
                    datetime(2000, 1, 3),
                    datetime(2000, 1, 4)]
        x1_cols = ['A', 'B']
        x1 = DataFrame(np.array(x1_data), index=x1_index,
                        columns=x1_cols)

        x2_data = [[13, 14, np.NaN],
                   [15, np.NaN, np.NaN],
                   [16, 17, 48],
                   [19, 20, 21],
                   [22, 23, 24]]
        x2_index = [datetime(2000, 1, 1),
                    datetime(2000, 1, 2),
                    datetime(2000, 1, 3),
                    datetime(2000, 1, 4),
                    datetime(2000, 1, 5)]
        x2_cols = ['C', 'A', 'B']
        x2 = DataFrame(np.array(x2_data), index=x2_index,
                        columns=x2_cols)

        self.panel_x2 = {'x1' : x1, 'x2' : x2}
Example #38
    def test_non_unique_loc(self):
        ## GH3659
        ## non-unique indexer with loc slice
        ## https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs

        # these are going to raise because we are non-monotonic
        df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3])
        self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,None)]))
        self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(0,None)]))
        self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1,2)]))

        # monotonic are ok
        df = DataFrame({'A' : [1,2,3,4,5,6], 'B' : [3,4,5,6,7,8]}, index = [0,1,0,1,2,3]).sort(axis=0)
        result = df.loc[1:]
        expected = DataFrame({'A' : [2,4,5,6], 'B' : [4, 6,7,8]}, index = [1,1,2,3])
        assert_frame_equal(result,expected)

        result = df.loc[0:]
        assert_frame_equal(result,df)

        result = df.loc[1:2]
        expected = DataFrame({'A' : [2,4,5], 'B' : [4,6,7]}, index = [1,1,2])
        assert_frame_equal(result,expected)
Example #39
    def query_results(self, jobs, results):
        """Parse query results and turn them into a NumPy DataFrame."""
        # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
        dtype_map = {
            'INTEGER': np.dtype(float),
            'FLOAT': np.dtype(float),
            # this seems to be buggy without the nanosecond indicator
            'TIMESTAMP': 'M8[ns]'
        }

        # This might take some time, so let the user know the query is done
        self.update_status("PROCESSING")

        self.job_complete()

        self.type = "query"
        self.total_rows = int(results['totalRows'])
        self.bytes_processed = int(results['totalBytesProcessed'])

        fields = results['schema']['fields']
        col_types = [field['type'] for field in fields]
        col_names = [
            field['name'].encode('ascii', 'ignore') for field in fields
        ]
        col_dtypes = [dtype_map.get(field['type'], object) for field in fields]
        row_array = np.zeros((self.total_rows, ),
                             dtype=list(zip(col_names, col_dtypes)))

        row_num = 0

        while 'rows' in results and row_num < self.total_rows:
            for row in results['rows']:
                entries = row.get('f', [])
                for col_num, field_type in enumerate(col_types):
                    field_value = BigQueryResult._parse_entry(
                        entries[col_num].get('v', ''), field_type)
                    row_array[row_num][col_num] = field_value

                row_num += 1

            page_token = results.get('pageToken', None)

            results = jobs.getQueryResults(
                projectId=secrets.BIGQUERY_PROJECT_ID,
                jobId=self.job_id,
                pageToken=page_token).execute()

        self.rows = DataFrame(row_array)

        self.animate()
Example #40
    def test_new_in0140(self):
        """
        Test new functionality in 0.14.0. This currently doesn't work.
        """
        # v0.14.0 whatsnew
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        ddf = self.psc.from_data_frame(df)
        g = ddf.groupby('A')
        result = g.first().collect()
        expected = df.iloc[[1, 2]].set_index('A')
        assert_frame_equal(result, expected)

        expected = df.iloc[[1, 2]].set_index('A')
        result = g.nth(0, dropna='any').collect()
        assert_frame_equal(result, expected)
Example #41
    def test_indexing_mixed_frame_bug(self):

        # GH3492
        df = DataFrame({
            'a': {
                1: 'aaa',
                2: 'bbb',
                3: 'ccc'
            },
            'b': {
                1: 111,
                2: 222,
                3: 333
            }
        })

        # this works, new column is created correctly
        df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)

        # this does not work, i.e. column test is not changed
        idx = df['test'] == '_'
        temp = df.ix[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)
        df.ix[idx, 'test'] = temp
        self.assert_(df.iloc[0, 2] == '-----')
Example #42
    def test_onecolumn_of_integer(self):
        '''
        GH 3628
        a column-of-integers DataFrame should transfer well to sql
        '''
        mono_df = DataFrame([1, 2], columns=['c0'])
        sql.write_frame(mono_df, con=self.db, name='mono_df')
        # computing the sum via sql
        con_x = self.db
        the_sum = sum([my_c0[0] for my_c0 in con_x.execute("select * from mono_df")])
        # it should not fail, and gives 3 (Issue #3628)
        self.assertEqual(the_sum, 3)

        result = sql.read_frame("select * from mono_df", con_x)
        tm.assert_frame_equal(result, mono_df)
Example #43
    def test_agg_compat(self):

        # GH 12334

        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'],
            'C': np.random.randn(8) + 1.0,
            'D': np.arange(8)
        })

        g = df.groupby(['A', 'B'])

        expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
        expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')])
        result = g['D'].agg({'C': ['sum', 'std']})
        assert_frame_equal(result, expected, check_like=True)

        expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
        expected.columns = ['C', 'D']
        result = g['D'].agg({'C': 'sum', 'D': 'std'})
        assert_frame_equal(result, expected, check_like=True)
Example #44
    def test_level_groupby_get_group(self):
        # GH15155
        df = DataFrame(data=np.arange(2, 22, 2),
                       index=MultiIndex(
                           levels=[pd.CategoricalIndex(["a", "b"]),
                                   range(10)],
                           labels=[[0] * 5 + [1] * 5,
                                   range(10)],
                           names=["Index1", "Index2"]))
        g = df.groupby(level=["Index1"])

        # expected should equal test.loc[["a"]]
        # GH15166
        expected = DataFrame(data=np.arange(2, 12, 2),
                             index=pd.MultiIndex(
                                 levels=[pd.CategoricalIndex(["a", "b"]),
                                         range(5)],
                                 labels=[[0] * 5, range(5)],
                                 names=["Index1", "Index2"]))
        result = g.get_group('a')

        assert_frame_equal(result, expected)
Example #45
    def test_ix_general(self):

        # ix general issues

        # GH 2817
        data = {
            'amount': {
                0: 700,
                1: 600,
                2: 222,
                3: 333,
                4: 444
            },
            'col': {
                0: 3.5,
                1: 3.5,
                2: 4.0,
                3: 4.0,
                4: 4.0
            },
            'year': {
                0: 2012,
                1: 2011,
                2: 2012,
                3: 2012,
                4: 2012
            }
        }
        df = DataFrame(data).set_index(keys=['col', 'year'])

        # this should raise correct error
        self.assertRaises(KeyError, df.ix.__getitem__, tuple([4.0, 2012]))

        # this is ok
        df.sortlevel(inplace=True)
        df.ix[(4.0, 2012)]
Example #46
    def setUp(self):
        import warnings
        warnings.filterwarnings(action='ignore', category=FutureWarning)

        self.series_ints   = Series(np.random.rand(4), index=range(0,8,2))
        self.frame_ints    = DataFrame(np.random.randn(4, 4), index=range(0, 8, 2), columns=range(0,12,3))
        self.panel_ints    = Panel(np.random.rand(4,4,4), items=range(0,8,2),major_axis=range(0,12,3),minor_axis=range(0,16,4))

        self.series_labels = Series(np.random.randn(4), index=list('abcd'))
        self.frame_labels  = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD'))
        self.panel_labels  = Panel(np.random.randn(4,4,4), items=list('abcd'), major_axis=list('ABCD'), minor_axis=list('ZYXW'))

        self.series_mixed  = Series(np.random.randn(4), index=[2, 4, 'null', 8])
        self.frame_mixed   = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8])
        self.panel_mixed   = Panel(np.random.randn(4,4,4), items=[2,4,'null',8])

        self.series_ts     = Series(np.random.randn(4), index=date_range('20130101', periods=4))
        self.frame_ts      = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4))
        self.panel_ts      = Panel(np.random.randn(4, 4, 4), items=date_range('20130101', periods=4))

        #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00])
        #self.frame_floats  = DataFrame(np.random.randn(4, 4), columns=[1.00, 2.00, 3.00, 4.00])
        #self.panel_floats  = Panel(np.random.rand(4,4,4), items = [1.00,2.00,3.00,4.00])

        self.frame_empty   = DataFrame({})
        self.series_empty  = Series({})
        self.panel_empty   = Panel({})

        # form agglomerates
        for o in self._objs:

            d = dict()
            for t in self._typs:
                d[t] = getattr(self,'%s_%s' % (o,t),None)

            setattr(self,o,d)
Example #47
    def test_bool_ops_warn_on_arithmetic(self):
        n = 10
        df = DataFrame({
            'a': np.random.rand(n) > 0.5,
            'b': np.random.rand(n) > 0.5
        })
        names = 'add', 'mul', 'sub'
        ops = '+', '*', '-'
        subs = {'+': '|', '*': '&', '-': '^'}
        sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'}
        for op, name in zip(ops, names):
            f = getattr(operator, name)
            fe = getattr(operator, sub_funcs[subs[op]])

            if op == '-':
                # raises TypeError
                continue

            with tm.use_numexpr(True, min_elements=5):
                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df, df)
                    e = fe(df, df)
                    tm.assert_frame_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df.a, df.b)
                    e = fe(df.a, df.b)
                    tm.assert_series_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df.a, True)
                    e = fe(df.a, True)
                    tm.assert_series_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(False, df.a)
                    e = fe(False, df.a)
                    tm.assert_series_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(False, df)
                    e = fe(False, df)
                    tm.assert_frame_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df, True)
                    e = fe(df, True)
                    tm.assert_frame_equal(r, e)
Example #48
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    filt_rhs = combined

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)
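
A minimal sketch of the row-validity mask used above (the column names here are illustrative):

import numpy as np
from pandas import DataFrame

combined = DataFrame({'x1': [1.0, np.nan, 3.0],
                      '__y__': [1.0, 2.0, np.nan]})

# a row is kept only when every column is non-null, exactly the
# (combined.count(1) == len(combined.columns)) test above
valid = (combined.count(1) == len(combined.columns)).values
print(valid)            # [ True False False]
print(combined[valid])  # only the fully observed row survives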
Example #49
    def test_bool_ops_warn_on_arithmetic(self):
        n = 10
        df = DataFrame({
            "a": np.random.rand(n) > 0.5,
            "b": np.random.rand(n) > 0.5
        })
        names = "add", "mul", "sub"
        ops = "+", "*", "-"
        subs = {"+": "|", "*": "&", "-": "^"}
        sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}
        for op, name in zip(ops, names):
            f = getattr(operator, name)
            fe = getattr(operator, sub_funcs[subs[op]])

            if op == "-":
                # raises TypeError
                continue

            with tm.use_numexpr(True, min_elements=5):
                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df, df)
                    e = fe(df, df)
                    tm.assert_frame_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df.a, df.b)
                    e = fe(df.a, df.b)
                    tm.assert_series_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df.a, True)
                    e = fe(df.a, True)
                    tm.assert_series_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(False, df.a)
                    e = fe(False, df.a)
                    tm.assert_series_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(False, df)
                    e = fe(False, df)
                    tm.assert_frame_equal(r, e)

                with tm.assert_produces_warning(check_stacklevel=False):
                    r = f(df, True)
                    e = fe(df, True)
                    tm.assert_frame_equal(r, e)
Example #50
    def setUp(self):
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        })

        self.df_mixed_floats = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.array(np.random.randn(8), dtype='float32')
        })

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3),
                                index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame({
            'A': [
                'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo',
                'foo', 'foo'
            ],
            'B': [
                'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two',
                'two', 'one'
            ],
            'C': [
                'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
                'dull', 'shiny', 'shiny', 'shiny'
            ],
            'D': np.random.randn(11),
            'E': np.random.randn(11),
            'F': np.random.randn(11)
        })
Example #51
    def read_sql(self, sql, index_col=None, coerce_float=True, params=None,
                 parse_dates=None):
        args = _convert_params(sql, params)
        cursor = self.execute(*args)
        columns = [col_desc[0] for col_desc in cursor.description]
        data = self._fetchall_as_list(cursor)
        cursor.close()

        data_frame = DataFrame.from_records(
            data, columns=columns, coerce_float=coerce_float)

        _parse_date_columns(data_frame, parse_dates)

        if index_col is not None:
            data_frame.set_index(index_col, inplace=True)
        return data_frame
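
A usage sketch of the same fetch-then-from_records pattern against a throwaway sqlite3 connection (the table and data are illustrative):

import sqlite3
from pandas import DataFrame

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE t (id INTEGER, name TEXT)')
con.executemany('INSERT INTO t VALUES (?, ?)', [(1, 'a'), (2, 'b')])

cursor = con.execute('SELECT * FROM t')
columns = [col_desc[0] for col_desc in cursor.description]
data = [tuple(row) for row in cursor.fetchall()]
cursor.close()

frame = DataFrame.from_records(data, columns=columns, coerce_float=True)
print(frame.set_index('id'))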
Example #52
    def setUp(self):
        arr = randn(N)
        arr[self._nan_locs] = np.NaN

        self.arr = arr
        self.rng = DateRange(datetime(2009, 1, 1), periods=N)

        self.series = Series(arr.copy(), index=self.rng)

        self.frame = DataFrame(randn(N, K),
                               index=self.rng,
                               columns=np.arange(K))

        self.matrix = DataMatrix(randn(N, K),
                                 index=self.rng,
                                 columns=np.arange(K))
Example #53
    def test_bool_ops_warn_on_arithmetic(self, op_str, opname):
        n = 10
        df = DataFrame({
            "a": np.random.rand(n) > 0.5,
            "b": np.random.rand(n) > 0.5
        })

        subs = {"+": "|", "*": "&", "-": "^"}
        sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}

        f = getattr(operator, opname)
        fe = getattr(operator, sub_funcs[subs[op_str]])

        if op_str == "-":
            # raises TypeError
            return

        with tm.use_numexpr(True, min_elements=5):
            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df, df)
                e = fe(df, df)
                tm.assert_frame_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df.a, df.b)
                e = fe(df.a, df.b)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df.a, True)
                e = fe(df.a, True)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(False, df.a)
                e = fe(False, df.a)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(False, df)
                e = fe(False, df)
                tm.assert_frame_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df, True)
                e = fe(df, True)
                tm.assert_frame_equal(r, e)
Example #54
    def read_sql(self, sql, index_col=None, coerce_float=True,
                 parse_dates=None, params=None):
        args = _convert_params(sql, params)

        result = self.execute(*args)
        data = result.fetchall()
        columns = result.keys()

        data_frame = DataFrame.from_records(
            data, columns=columns, coerce_float=coerce_float)

        _parse_date_columns(data_frame, parse_dates)

        if index_col is not None:
            data_frame.set_index(index_col, inplace=True)

        return data_frame
Example #55
def _flex_binary_moment(arg1, arg2, f, pairwise=False):
    if not (isinstance(arg1, (np.ndarray, Series, DataFrame))
            and isinstance(arg2, (np.ndarray, Series, DataFrame))):
        raise TypeError("arguments to moment function must be of type "
                        "np.ndarray/Series/DataFrame")

    if isinstance(arg1, (np.ndarray, Series)) and \
            isinstance(arg2, (np.ndarray,Series)):
        X, Y = _prep_binary(arg1, arg2)
        return f(X, Y)
    elif isinstance(arg1, DataFrame):
        results = {}
        if isinstance(arg2, DataFrame):
            X, Y = arg1.align(arg2, join='outer')
            if pairwise is False:
                X = X + 0 * Y
                Y = Y + 0 * X
                res_columns = arg1.columns.union(arg2.columns)
                for col in res_columns:
                    if col in X and col in Y:
                        results[col] = f(X[col], Y[col])
            elif pairwise is True:
                results = defaultdict(dict)
                for i, k1 in enumerate(arg1.columns):
                    for j, k2 in enumerate(arg2.columns):
                        if j < i and arg2 is arg1:
                            # Symmetric case
                            results[k1][k2] = results[k2][k1]
                        else:
                            results[k1][k2] = f(
                                *_prep_binary(arg1[k1], arg2[k2]))
                return Panel.from_dict(results).swapaxes('items', 'major')
            else:
                raise ValueError("'pairwise' is not True/False")
        else:
            res_columns = arg1.columns
            X, Y = arg1.align(arg2, axis=0, join='outer')
            results = {}

            for col in res_columns:
                results[col] = f(X[col], Y)

        return DataFrame(results, index=X.index, columns=res_columns)
    else:
        return _flex_binary_moment(arg2, arg1, f)
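
A minimal sketch of the pairwise=False DataFrame/DataFrame branch using only public API; f here is an arbitrary stand-in moment function, not the library's own:

import numpy as np
from pandas import DataFrame

def f(x, y):
    # any binary moment; plain correlation here
    return np.corrcoef(x, y)[0, 1]

arg1 = DataFrame(np.random.randn(10, 2), columns=['a', 'b'])
arg2 = DataFrame(np.random.randn(10, 2), columns=['b', 'c'])

# align on the union of columns and apply f only where a column
# exists on both sides (here just 'b')
X, Y = arg1.align(arg2, join='outer')
results = {col: f(X[col], Y[col])
           for col in arg1.columns.union(arg2.columns)
           if col in arg1 and col in arg2}
print(results)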
Example #56
    def test_bool_ops_raise_on_arithmetic(self):
        df = DataFrame({
            'a': np.random.rand(10) > 0.5,
            'b': np.random.rand(10) > 0.5
        })
        names = 'add', 'mul', 'sub', 'div', 'truediv', 'floordiv', 'pow'
        ops = '+', '*', '-', '/', '/', '//', '**'
        msg = 'operator %r not implemented for bool dtypes'
        for op, name in zip(ops, names):
            if not compat.PY3 or name != 'div':
                f = getattr(operator, name)
                err_msg = re.escape(msg % op)

                with tm.assertRaisesRegexp(NotImplementedError, err_msg):
                    f(df, df)

                with tm.assertRaisesRegexp(NotImplementedError, err_msg):
                    f(df.a, df.b)
Example #57
        def _check(method):
            series = self.frame[1]

            res = method(series, self.frame, 10)
            res2 = method(self.frame, series, 10)
            exp = self.frame.apply(lambda x: method(series, x, 10))

            tm.assert_frame_equal(res, exp)
            tm.assert_frame_equal(res2, exp)

            frame2 = self.frame.copy()
            frame2.values[:] = np.random.randn(*frame2.shape)

            res3 = method(self.frame, frame2, 10)
            exp = DataFrame(
                dict((k, method(self.frame[k], frame2[k], 10))
                     for k in self.frame))
            tm.assert_frame_equal(res3, exp)
Example #58
def _take_new_index(obj, indexer, new_index, axis=0):
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        data = obj._data

        new_blocks = [b.take(indexer, axis=1) for b in data.blocks]
        new_axes = list(data.axes)
        new_axes[1] = new_index
        new_data = BlockManager(new_blocks, new_axes)
        return DataFrame(new_data)
    else:
        raise NotImplementedError
Example #59
def _rollingMoment(arg, window, func, minp, time_rule=None):
    """
    Rolling statistical measure using supplied function. Designed to be
    used with passed-in Cython array-based functions.

    Parameters
    ----------
    arg :  DataFrame or numpy ndarray-like
    window : Number of observations used for calculating statistic
    func : Cython function to compute rolling statistic on raw series
    minp : int
        Minimum number of observations required to have a value
    """
    types = (DataFrame, DataMatrix, Series)
    if time_rule is not None and isinstance(arg, types):
        # Conform to whatever frequency needed.
        arg = arg.asfreq(time_rule)

    if isinstance(arg, DataMatrix):
        T, N = arg.values.shape
        resultMatrix = np.empty((T, N), dtype=arg.values.dtype)
        arg.values[np.isinf(arg.values)] = NaN
        for i in range(N):
            resultMatrix[:, i] = func(arg.values[:, i], window, minp=minp)
        output = DataMatrix(resultMatrix, index=arg.index, columns=arg.columns)

    elif isinstance(arg, DataFrame):
        output = DataFrame(index=arg.index)
        for col, series in arg.iteritems():
            series[np.isinf(series)] = NaN
            output[col] = Series(func(series, window, minp=minp),
                                 index=series.index)
    elif isinstance(arg, Series):
        arg[np.isinf(arg)] = NaN
        output = Series(func(arg, window, minp=minp), index=arg.index)
    else:
        try:
            assert (hasattr(arg, '__iter__'))
        except AssertionError:
            raise AssertionError('Expected DataFrame or array-like argument')
        arg[np.isinf(arg)] = NaN
        output = func(arg, window, minp=minp)
    return output
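
For comparison, a sketch of what the same inf-masking plus windowed statistic reduces to with the modern pandas .rolling API (an assumption about intent, not the original call path):

import numpy as np
from pandas import Series, date_range

s = Series(np.random.randn(20), index=date_range('2000-01-01', periods=20))
s.iloc[3] = np.inf

# replace infs with NaN, then roll: a single .rolling(...) call
# instead of a passed-in Cython function
clean = s.replace([np.inf, -np.inf], np.nan)
result = clean.rolling(window=5, min_periods=3).mean()
print(result.tail())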
Example #60
    def test_xs_multiindex(self):

        # GH2903
        columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), ('b', 'hello'), ('b', 'world')], names=['lvl0', 'lvl1'])
        df = DataFrame(np.random.randn(4, 4), columns=columns)
        df.sortlevel(axis=1,inplace=True)
        result = df.xs('a', level='lvl0', axis=1)
        expected = df.iloc[:,0:2].loc[:,'a']
        assert_frame_equal(result,expected)

        result = df.xs('foo', level='lvl1', axis=1)
        expected = df.iloc[:, 1:2].copy()
        expected.columns = expected.columns.droplevel('lvl1')
        assert_frame_equal(result, expected)