def test_groupby_categorical(self):
    levels = ["foo", "bar", "baz", "qux"]
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats).mean()

    expected = data.groupby(np.asarray(cats)).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=["foo", "bar", "baz", "qux"])
    expected = ord_data.groupby(exp_cats, sort=False).describe()
    expected.index.names = [None, None]
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    exp = CategoricalIndex(expc)
    self.assert_index_equal(desc_result.index.get_level_values(0), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%",
                 "75%", "max"] * 4)
    self.assert_index_equal(desc_result.index.get_level_values(1), exp)
def test_filter_using_len(self):
    # BUG GH4447
    df = DataFrame({'A': np.arange(8),
                    'B': list('aabbbbcc'),
                    'C': np.arange(8)})
    grouped = df.groupby('B')
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = DataFrame({'A': np.arange(2, 6),
                          'B': list('bbbb'),
                          'C': np.arange(2, 6)},
                         index=np.arange(2, 6))
    assert_frame_equal(actual, expected)

    actual = grouped.filter(lambda x: len(x) > 4)
    expected = df.ix[[]]
    assert_frame_equal(actual, expected)

    # Series have always worked properly, but we'll test anyway.
    s = df['B']
    grouped = s.groupby(s)
    actual = grouped.filter(lambda x: len(x) > 2)
    expected = Series(4 * ['b'], index=np.arange(2, 6), name='B')
    assert_series_equal(actual, expected)

    actual = grouped.filter(lambda x: len(x) > 4)
    expected = s[[]]
    assert_series_equal(actual, expected)
def test_fromRecords_toRecords(self):
    # structured array
    K = 10

    recs = np.zeros(K, dtype="O,O,f8,f8")
    recs["f0"] = range(K / 2) * 2
    recs["f1"] = np.arange(K) / (K / 2)
    recs["f2"] = np.arange(K) * 2
    recs["f3"] = np.arange(K)

    lp = LongPanel.fromRecords(recs, "f0", "f1")
    self.assertEqual(len(lp.items), 2)

    lp = LongPanel.fromRecords(recs, "f0", "f1", exclude=["f2"])
    self.assertEqual(len(lp.items), 1)

    torecs = lp.toRecords()
    self.assertEqual(len(torecs.dtype.names), len(lp.items) + 2)

    # DataFrame
    df = DataFrame.from_records(recs)

    lp = LongPanel.fromRecords(df, "f0", "f1", exclude=["f2"])
    self.assertEqual(len(lp.items), 1)

    # dict of arrays
    series = DataFrame.from_records(recs)._series

    lp = LongPanel.fromRecords(series, "f0", "f1", exclude=["f2"])
    self.assertEqual(len(lp.items), 1)
    self.assert_("f2" in series)

    self.assertRaises(Exception, LongPanel.fromRecords,
                      np.zeros((3, 3)), 0, 1)
def test_apply_categorical_data(self):
    # GH 10138
    for ordered in [True, False]:
        dense = Categorical(list('abc'), ordered=ordered)

        # 'b' is in the categories but not in the list
        missing = Categorical(
            list('aaa'), categories=['a', 'b'], ordered=ordered)
        values = np.arange(len(dense))
        df = DataFrame({'missing': missing,
                        'dense': dense,
                        'values': values})
        grouped = df.groupby(['missing', 'dense'])

        # missing category 'b' should still exist in the output index
        idx = MultiIndex.from_product(
            [Categorical(['a', 'b'], ordered=ordered),
             Categorical(['a', 'b', 'c'], ordered=ordered)],
            names=['missing', 'dense'])
        expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                             index=idx,
                             columns=['values'])

        assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
        assert_frame_equal(grouped.mean(), expected)
        assert_frame_equal(grouped.agg(np.mean), expected)

        # but for transform we should still get back the original index
        idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
                                      names=['missing', 'dense'])
        expected = Series(1, index=idx)
        assert_series_equal(grouped.apply(lambda x: 1), expected)
def test_groupby_categorical_index(self):
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=20)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4),
                   columns=list('abcd'))
    df['cats'] = cats

    # with a cat index
    result = df.set_index('cats').groupby(level=0).sum()
    expected = df[list('abcd')].groupby(cats.codes).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True),
        name='cats')
    assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby('cats').sum()
    expected = df[list('abcd')].groupby(cats.codes).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True),
        name='cats')
    assert_frame_equal(result, expected)
def test_multi_assign(self):
    # GH 3626, an assignment of a sub-df to a df
    df = DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
                    'PF': [0, 0, 0, 0, 1, 1],
                    'col1': lrange(6),
                    'col2': lrange(6, 12)})
    df.ix[1, 0] = np.nan
    df2 = df.copy()

    mask = ~df2.FC.isnull()
    cols = ['col1', 'col2']

    dft = df2 * 2
    dft.ix[3, 3] = np.nan

    expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
                          'PF': [0, 0, 0, 0, 1, 1],
                          'col1': Series([0, 1, 4, 6, 8, 10]),
                          'col2': [12, 7, 16, np.nan, 20, 22]})

    # frame on rhs
    df2.ix[mask, cols] = dft.ix[mask, cols]
    assert_frame_equal(df2, expected)

    df2.ix[mask, cols] = dft.ix[mask, cols]
    assert_frame_equal(df2, expected)

    # with an ndarray on rhs
    df2 = df.copy()
    df2.ix[mask, cols] = dft.ix[mask, cols].values
    assert_frame_equal(df2, expected)

    df2.ix[mask, cols] = dft.ix[mask, cols].values
    assert_frame_equal(df2, expected)
def test_filter_multiple_timestamp(self):
    # GH 10114
    df = DataFrame({'A': np.arange(5, dtype='int64'),
                    'B': ['foo', 'bar', 'foo', 'bar', 'bar'],
                    'C': Timestamp('20130101')})

    grouped = df.groupby(['B', 'C'])

    result = grouped['A'].filter(lambda x: True)
    assert_series_equal(df['A'], result)

    result = grouped['A'].transform(len)
    expected = Series([2, 3, 2, 3, 3], name='A')
    assert_series_equal(result, expected)

    result = grouped.filter(lambda x: True)
    assert_frame_equal(df, result)

    result = grouped.transform('sum')
    expected = DataFrame({'A': [2, 8, 2, 8, 8]})
    assert_frame_equal(result, expected)

    result = grouped.transform(len)
    expected = DataFrame({'A': [2, 3, 2, 3, 3]})
    assert_frame_equal(result, expected)
def test_fromRecords_toRecords(self):
    # structured array
    K = 10

    recs = np.zeros(K, dtype='O,O,f8,f8')
    recs['f0'] = range(K / 2) * 2
    recs['f1'] = np.arange(K) / (K / 2)
    recs['f2'] = np.arange(K) * 2
    recs['f3'] = np.arange(K)

    lp = LongPanel.fromRecords(recs, 'f0', 'f1')
    self.assertEqual(len(lp.items), 2)

    lp = LongPanel.fromRecords(recs, 'f0', 'f1', exclude=['f2'])
    self.assertEqual(len(lp.items), 1)

    torecs = lp.toRecords()
    self.assertEqual(len(torecs.dtype.names), len(lp.items) + 2)

    # DataFrame
    df = DataFrame.fromRecords(recs)

    lp = LongPanel.fromRecords(df, 'f0', 'f1', exclude=['f2'])
    self.assertEqual(len(lp.items), 1)

    # dict of arrays
    series = DataFrame.fromRecords(recs)._series

    lp = LongPanel.fromRecords(series, 'f0', 'f1', exclude=['f2'])
    self.assertEqual(len(lp.items), 1)
    self.assert_('f2' in series)

    self.assertRaises(Exception, LongPanel.fromRecords,
                      np.zeros((3, 3)), 0, 1)
def test_agg_nested_dicts(self):
    # API change for disallowing these types of nested dicts
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    g = df.groupby(['A', 'B'])

    def f():
        g.aggregate({'r1': {'C': ['mean', 'sum']},
                     'r2': {'D': ['mean', 'sum']}})

    self.assertRaises(SpecificationError, f)

    result = g.agg({'C': {'ra': ['mean', 'std']},
                    'D': {'rb': ['mean', 'std']}})
    expected = pd.concat([g['C'].mean(), g['C'].std(),
                          g['D'].mean(), g['D'].std()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'),
                                                  ('ra', 'std'),
                                                  ('rb', 'mean'),
                                                  ('rb', 'std')])
    assert_frame_equal(result, expected, check_like=True)

    # same name as the original column
    # GH9052
    expected = g['D'].agg({'result1': np.sum, 'result2': np.mean})
    expected = expected.rename(columns={'result1': 'D'})
    result = g['D'].agg({'D': np.sum, 'result2': np.mean})
    assert_frame_equal(result, expected, check_like=True)
def test_agg_compat(self):
    # GH 12334
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    g = df.groupby(['A', 'B'])

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')])
    result = g['D'].agg({'C': ['sum', 'std']})
    assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = ['C', 'D']
    result = g['D'].agg({'C': 'sum', 'D': 'std'})
    assert_frame_equal(result, expected, check_like=True)
def test_agg_dict_parameter_cast_result_dtypes(self):
    # GH 12821
    df = DataFrame(
        {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
         'time': date_range('1/1/2011', periods=8, freq='H')})
    df.loc[[0, 1, 2, 5], 'time'] = None

    # test for `first` function
    exp = df.loc[[0, 3, 4, 6]].set_index('class')
    grouped = df.groupby('class')
    assert_frame_equal(grouped.first(), exp)
    assert_frame_equal(grouped.agg('first'), exp)
    assert_frame_equal(grouped.agg({'time': 'first'}), exp)
    assert_series_equal(grouped.time.first(), exp['time'])
    assert_series_equal(grouped.time.agg('first'), exp['time'])

    # test for `last` function
    exp = df.loc[[0, 3, 4, 7]].set_index('class')
    grouped = df.groupby('class')
    assert_frame_equal(grouped.last(), exp)
    assert_frame_equal(grouped.agg('last'), exp)
    assert_frame_equal(grouped.agg({'time': 'last'}), exp)
    assert_series_equal(grouped.time.last(), exp['time'])
    assert_series_equal(grouped.time.agg('last'), exp['time'])
def test_apply_categorical_data(self):
    # GH 10138
    for ordered in [True, False]:
        dense = Categorical(list("abc"), ordered=ordered)

        # 'b' is in the categories but not in the list
        missing = Categorical(list("aaa"), categories=["a", "b"],
                              ordered=ordered)
        values = np.arange(len(dense))
        df = DataFrame({"missing": missing,
                        "dense": dense,
                        "values": values})
        grouped = df.groupby(["missing", "dense"])

        # missing category 'b' should still exist in the output index
        idx = MultiIndex.from_product(
            [Categorical(["a", "b"], ordered=ordered),
             Categorical(["a", "b", "c"], ordered=ordered)],
            names=["missing", "dense"],
        )
        expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                             index=idx, columns=["values"])

        assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
        assert_frame_equal(grouped.mean(), expected)
        assert_frame_equal(grouped.agg(np.mean), expected)

        # but for transform we should still get back the original index
        idx = MultiIndex.from_product([["a"], ["a", "b", "c"]],
                                      names=["missing", "dense"])
        expected = Series(1, index=idx)
        assert_series_equal(grouped.apply(lambda x: 1), expected)
def test_groupby_datetime_categorical(self):
    # GH9049: ensure backward compatibility
    levels = pd.date_range("2014-01-01", periods=4)
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats).mean()

    expected = data.groupby(np.asarray(cats)).mean()
    expected = expected.reindex(levels)
    expected.index = CategoricalIndex(expected.index,
                                      categories=expected.index,
                                      ordered=True)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = cats.take_nd(idx)
    ord_data = data.take(idx)
    expected = ord_data.groupby(ord_labels).describe()
    expected.index.names = [None, None]
    assert_frame_equal(desc_result, expected)
    tm.assert_index_equal(desc_result.index, expected.index)

    tm.assert_index_equal(
        desc_result.index.get_level_values(0),
        expected.index.get_level_values(0))

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    exp = CategoricalIndex(expc)
    self.assert_index_equal(desc_result.index.get_level_values(0), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%",
                 "75%", "max"] * 4)
    self.assert_index_equal(desc_result.index.get_level_values(1), exp)
def test_groupby_describe_categorical_columns(self):
    # GH 11558
    cats = pd.CategoricalIndex(["qux", "foo", "baz", "bar"],
                               categories=["foo", "bar", "baz", "qux"],
                               ordered=True)
    df = DataFrame(np.random.randn(20, 4), columns=cats)
    result = df.groupby([1, 2, 3, 4] * 5).describe()

    tm.assert_index_equal(result.columns, cats)
    tm.assert_categorical_equal(result.columns.values, cats.values)
def test_multi_nan_indexing(self):
    # GH 3588
    df = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'],
                    'b': ["C1", "C2", "C3", "C4"],
                    "c": [10, 15, np.nan, 20]})
    result = df.set_index(['a', 'b'], drop=False)
    expected = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'],
                          'b': ["C1", "C2", "C3", "C4"],
                          "c": [10, 15, np.nan, 20]},
                         index=[Index(['R1', 'R2', np.nan, 'R4'], name='a'),
                                Index(['C1', 'C2', 'C3', 'C4'], name='b')])
    assert_frame_equal(result, expected)
def test_ix_weird_slicing(self):
    ## http://stackoverflow.com/q/17056560/1240268
    df = DataFrame({'one': [1, 2, 3, np.nan, np.nan],
                    'two': [1, 2, 3, 4, 5]})
    df.ix[df['one'] > 1, 'two'] = -df['two']

    expected = DataFrame({'one': {0: 1.0, 1: 2.0, 2: 3.0, 3: nan, 4: nan},
                          'two': {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}})
    assert_frame_equal(df, expected)
def test_xs_multiindex(self):
    # GH2903
    columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                      ('b', 'hello'), ('b', 'world')],
                                     names=['lvl0', 'lvl1'])
    df = DataFrame(np.random.randn(4, 4), columns=columns)
    df.sortlevel(axis=1, inplace=True)
    result = df.xs('a', level='lvl0', axis=1)
    expected = df.iloc[:, 0:2].loc[:, 'a']
    assert_frame_equal(result, expected)
def test_var_on_multiplegroups(self):
    df = DataFrame({'data1': np.random.randn(5),
                    'data2': np.random.randn(5),
                    'data3': np.random.randn(5),
                    'key1': ['a', 'a', 'b', 'b', 'a'],
                    'key2': ['one', 'two', 'one', 'two', 'one']})
    ddf = self.psc.from_data_frame(df)
    dgrouped = ddf.groupby(['key1', 'key2'])
    grouped = df.groupby(['key1', 'key2'])
    assert_frame_equal(dgrouped.var().collect(), grouped.var())
def test_setitem_dtype_upcast(self):
    # GH3216
    df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
    df['c'] = np.nan
    self.assert_(df['c'].dtype == np.float64)

    df.ix[0, 'c'] = 'foo'
    expected = DataFrame([{"a": 1, "c": 'foo'},
                          {"a": 3, "b": 2, "c": np.nan}])
    assert_frame_equal(df, expected)
def test_var_on_multiplegroups(self):
    pd_df = DataFrame({'data1': np.random.randn(5),
                       'data2': np.random.randn(5),
                       'data3': np.random.randn(5),
                       'key1': ['a', 'a', 'b', 'b', 'a'],
                       'key2': ['one', 'two', 'one', 'two', 'one']})
    sp_df = self.psc.from_pd_data_frame(pd_df)
    actual_grouped = sp_df.groupby(['key1', 'key2'])
    expected_grouped = pd_df.groupby(['key1', 'key2'])
    assert_frame_equal(actual_grouped.var().collect(),
                       expected_grouped.var())
def test_agg_item_by_item_raise_typeerror(self):
    from numpy.random import randint

    df = DataFrame(randint(10, size=(20, 10)))

    def raiseException(df):
        pprint_thing('----------------------------------------')
        pprint_thing(df.to_string())
        raise TypeError

    self.assertRaises(TypeError, df.groupby(0).agg, raiseException)
def test_filter_nan_is_false(self):
    df = DataFrame({'A': np.arange(8),
                    'B': list('aabbbbcc'),
                    'C': np.arange(8)})
    s = df['B']
    g_df = df.groupby(df['B'])
    g_s = s.groupby(s)

    f = lambda x: np.nan
    assert_frame_equal(g_df.filter(f), df.loc[[]])
    assert_series_equal(g_s.filter(f), s[[]])
def test_iloc_setitem_series(self):
    """ originally from test_series.py """
    df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'),
                   columns=list('ABCD'))

    df.iloc[1, 1] = 1
    result = df.iloc[1, 1]
    self.assert_(result == 1)

    df.iloc[:, 2:3] = 0
    expected = df.iloc[:, 2:3]
    result = df.iloc[:, 2:3]
    assert_frame_equal(result, expected)
def test_indexing_mixed_frame_bug(self):
    # GH3492
    df = DataFrame({"a": {1: "aaa", 2: "bbb", 3: "ccc"},
                    "b": {1: 111, 2: 222, 3: 333}})

    # this works, new column is created correctly
    df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x)

    # this does not work, ie column test is not changed
    idx = df["test"] == "_"
    temp = df.ix[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x)
    df.ix[idx, "test"] = temp
    self.assert_(df.iloc[0, 2] == "-----")
def test_indexing_mixed_frame_bug(self):
    # GH3492
    df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'},
                    'b': {1: 111, 2: 222, 3: 333}})

    # this works, new column is created correctly
    df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)

    # this does not work, ie column test is not changed
    idx = df['test'] == '_'
    temp = df.ix[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)
    df.ix[idx, 'test'] = temp
    self.assert_(df.iloc[0, 2] == '-----')
def predict(self, beta=None, x=None, fill_value=None,
            fill_method=None, axis=0):
    """
    Parameters
    ----------
    beta : Series
    x : Series or DataFrame
    fill_value : scalar or dict, default None
    fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
    axis : {0, 1}, default 0
        See DataFrame.fillna for more details

    Notes
    -----
    1. If both fill_value and fill_method are None then NaNs are dropped
       (this is the default behavior)
    2. An intercept will be automatically added to the new_y_values if
       the model was fitted using an intercept

    Returns
    -------
    Series of predicted values
    """
    if beta is None and x is None:
        return self.y_predict

    if beta is None:
        beta = self.beta
    else:
        beta = beta.reindex(self.beta.index)
        if isnull(beta).any():
            raise ValueError('Must supply betas for same variables')

    if x is None:
        x = self._x
        orig_x = x
    else:
        orig_x = x
        if fill_value is None and fill_method is None:
            x = x.dropna(how='any')
        else:
            x = x.fillna(value=fill_value, method=fill_method, axis=axis)
        if isinstance(x, Series):
            x = DataFrame({'x': x})
        if self._intercept:
            x['intercept'] = 1.
        x = x.reindex(columns=self._x.columns)

    rs = np.dot(x.values, beta.values)
    return Series(rs, x.index).reindex(orig_x.index)
def test_filter_against_workaround(self):
    np.random.seed(0)
    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})

    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
    assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.
                 transform(lambda x: len(x) < N / 10).astype('bool')]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
    assert_frame_equal(new_way, old_way)
def test_iloc_multiindex(self):
    df = DataFrame(np.random.randn(3, 3),
                   columns=[[2, 2, 4], [6, 8, 10]],
                   index=[[4, 4, 8], [8, 10, 12]])

    rs = df.iloc[2]
    xp = df.irow(2)
    assert_series_equal(rs, xp)

    rs = df.iloc[:, 2]
    xp = df.icol(2)
    assert_series_equal(rs, xp)

    rs = df.iloc[2, 2]
    xp = df.values[2, 2]
    self.assert_(rs == xp)
def test_agg_period_index(self):
    from pandas import period_range, PeriodIndex
    prng = period_range('2012-1-1', freq='M', periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    tm.assertIsInstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start='1999-01', periods=5, freq='M')
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    series = [('s1', s1), ('s2', s2)]
    df = DataFrame.from_items(series)
    grouped = df.groupby(df.index.month)
    list(grouped)
def test_astype_assignment_with_iloc(self):
    # GH4312
    df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']],
                        columns=list('ABCDEFG'))

    df = df_orig.copy()
    df.iloc[:, 0:3] = df.iloc[:, 0:3].astype(int)
    result = df.get_dtype_counts().sort_index()
    expected = Series({'int64': 4, 'float64': 1, 'object': 2}).sort_index()
    assert_series_equal(result, expected)

    df = df_orig.copy()
    df.iloc[:, 0:3] = df.iloc[:, 0:3].convert_objects(convert_numeric=True)
    result = df.get_dtype_counts().sort_index()
    expected = Series({'int64': 4, 'float64': 1, 'object': 2}).sort_index()
def percentileRank(frame, column=None, kind='mean'):
    """
    Return score at percentile for each point in time (cross-section)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
        Column name or specific Series to compute percentiles for.
        If not provided, percentiles are computed for all values at each
        point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score. In case of multiple
          matches, average the percentage rankings of all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
          distribution function. A percentileofscore of 80% means that 80%
          of values are less than or equal to the provided score.
        - "strict": Similar to "weak", except that only values that are
          strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often
          used in testing. See
          http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    TimeSeries or DataFrame, depending on input
    """
    from pandas.compat.scipy import percentileofscore
    fun = lambda xs, score: percentileofscore(remove_na(xs),
                                              score, kind=kind)

    results = {}
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            for date, xs in frame.T.items():
                results[date] = fun(xs, column.get(date, NaN))
        else:
            for date, xs in frame.T.items():
                results[date] = fun(xs, xs[column])
        results = Series(results)
    else:
        for column in frame.columns:
            for date, xs in framet.items():
                results.setdefault(date, {})[column] = fun(xs, xs[column])
        results = DataFrame(results).T
    return results
def _parse_data(schema, rows):
    # TODO(db) Is dtype_map important? Was previously used to build a
    # numpy array that built the pandas df
    #
    # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
    # dtype_map = {'INTEGER': np.dtype(float),
    #              'FLOAT': np.dtype(float),
    #              # This seems to be buggy without nanosecond indicator
    #              'TIMESTAMP': 'M8[ns]'}
    # col_dtypes = [dtype_map.get(field['type'], object)
    #               for field in schema['fields']]

    return DataFrame(
        OrderedDict([(field['name'], _parse_entry(field, row_cell))
                     for field, row_cell
                     in zip(schema['fields'], row.get('f', []))])
        for row in rows)
def _take_new_index(obj, indexer, new_index, axis=0):
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        new_values = algos.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        return DataFrame(obj._data.reindex_indexer(
            new_axis=new_index, indexer=indexer, axis=1))
    else:
        raise ValueError("'obj' should be either a Series or a DataFrame")
def test_filter_and_transform_with_non_unique_string_index(self):
    # GH4620
    index = list('bbbcbbab')
    df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3],
                    'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index)
    grouped_df = df.groupby('tag')
    ser = df['pid']
    grouped_ser = ser.groupby(df['tag'])
    expected_indexes = [1, 2, 4, 7]

    # Filter DataFrame
    actual = grouped_df.filter(lambda x: len(x) > 1)
    expected = df.iloc[expected_indexes]
    assert_frame_equal(actual, expected)

    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
    expected = df.copy()
    expected.iloc[[0, 3, 5, 6]] = np.nan
    assert_frame_equal(actual, expected)

    # Filter Series
    actual = grouped_ser.filter(lambda x: len(x) > 1)
    expected = ser.take(expected_indexes)
    assert_series_equal(actual, expected)

    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
    NA = np.nan
    expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid')
    # ^ made manually because this can get confusing!
    assert_series_equal(actual, expected)

    # Transform Series
    actual = grouped_ser.transform(len)
    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid')
    assert_series_equal(actual, expected)

    # Transform (a column from) DataFrameGroupBy
    actual = grouped_df.pid.transform(len)
    assert_series_equal(actual, expected)
def _take_new_index(obj, indexer, new_index, axis=0):
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        return DataFrame(obj._data.take(indexer,
                                        new_index=new_index,
                                        axis=1))
    else:
        raise NotImplementedError
def test_agg_api(self):
    # Note: needs a very recent version of pandas to pass
    # TODO(holden): Pass this test if local fails
    # GH 6337
    # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error
    # different api for agg when passed custom function with mixed frame
    df = DataFrame({'data1': np.random.randn(5),
                    'data2': np.random.randn(5),
                    'key1': ['a', 'a', 'b', 'b', 'a'],
                    'key2': ['one', 'two', 'one', 'two', 'one']})
    ddf = self.psc.from_data_frame(df)
    dgrouped = ddf.groupby('key1')
    grouped = df.groupby('key1')

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    expected = grouped.agg([peak_to_peak])
    expected.columns = ['data1', 'data2']
    result = dgrouped.agg(peak_to_peak).collect()
    assert_frame_equal(result, expected)
def createData2(self):
    y_data = [[1, np.NaN], [2, 3], [4, 5]]
    y_index = [datetime(2000, 1, 1),
               datetime(2000, 1, 2),
               datetime(2000, 1, 3)]
    y_cols = ['A', 'B']
    self.panel_y2 = DataFrame(np.array(y_data), index=y_index,
                              columns=y_cols)

    x1_data = [[6, np.NaN], [7, 8], [9, 30], [11, 12]]
    x1_index = [datetime(2000, 1, 1),
                datetime(2000, 1, 2),
                datetime(2000, 1, 3),
                datetime(2000, 1, 4)]
    x1_cols = ['A', 'B']
    x1 = DataFrame(np.array(x1_data), index=x1_index, columns=x1_cols)

    x2_data = [[13, 14, np.NaN], [15, np.NaN, np.NaN], [16, 17, 48],
               [19, 20, 21], [22, 23, 24]]
    x2_index = [datetime(2000, 1, 1),
                datetime(2000, 1, 2),
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5)]
    x2_cols = ['C', 'A', 'B']
    x2 = DataFrame(np.array(x2_data), index=x2_index, columns=x2_cols)

    self.panel_x2 = {'x1': x1, 'x2': x2}
def test_non_unique_loc(self):
    ## GH3659
    ## non-unique indexer with loc slice
    ## https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs

    # these are going to raise because we are non-monotonic
    df = DataFrame({'A': [1, 2, 3, 4, 5, 6],
                    'B': [3, 4, 5, 6, 7, 8]},
                   index=[0, 1, 0, 1, 2, 3])
    self.assertRaises(KeyError, df.loc.__getitem__,
                      tuple([slice(1, None)]))
    self.assertRaises(KeyError, df.loc.__getitem__,
                      tuple([slice(0, None)]))
    self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, 2)]))

    # monotonic are ok
    df = DataFrame({'A': [1, 2, 3, 4, 5, 6],
                    'B': [3, 4, 5, 6, 7, 8]},
                   index=[0, 1, 0, 1, 2, 3]).sort(axis=0)
    result = df.loc[1:]
    expected = DataFrame({'A': [2, 4, 5, 6],
                          'B': [4, 6, 7, 8]},
                         index=[1, 1, 2, 3])
    assert_frame_equal(result, expected)

    result = df.loc[0:]
    assert_frame_equal(result, df)

    result = df.loc[1:2]
    expected = DataFrame({'A': [2, 4, 5],
                          'B': [4, 6, 7]},
                         index=[1, 1, 2])
    assert_frame_equal(result, expected)
def query_results(self, jobs, results):
    """Parse query results and turn them into a pandas DataFrame."""
    # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
    dtype_map = {
        'INTEGER': np.dtype(float),
        'FLOAT': np.dtype(float),
        # This seems to be buggy without nanosecond indicator
        'TIMESTAMP': 'M8[ns]'
    }

    # This might take some time, so let the user know the query is done
    self.update_status("PROCESSING")
    self.job_complete()

    self.type = "query"
    self.total_rows = int(results['totalRows'])
    self.bytes_processed = int(results['totalBytesProcessed'])

    fields = results['schema']['fields']
    col_types = [field['type'] for field in fields]
    col_names = [
        field['name'].encode('ascii', 'ignore') for field in fields
    ]
    col_dtypes = [dtype_map.get(field['type'], object) for field in fields]
    row_array = np.zeros((self.total_rows, ),
                         dtype=zip(col_names, col_dtypes))

    row_num = 0
    while 'rows' in results and row_num < self.total_rows:
        for row in results['rows']:
            entries = row.get('f', [])
            for col_num, field_type in enumerate(col_types):
                field_value = BigQueryResult._parse_entry(
                    entries[col_num].get('v', ''), field_type)
                row_array[row_num][col_num] = field_value
            row_num += 1

        page_token = results.get('pageToken', None)
        results = jobs.getQueryResults(
            projectId=secrets.BIGQUERY_PROJECT_ID,
            jobId=self.job_id,
            pageToken=page_token).execute()

    self.rows = DataFrame(row_array)
    self.animate()
def test_new_in0140(self):
    """
    Test new functionality in 0.14.0. This currently doesn't work.
    """
    # v0.14.0 whatsnew
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    ddf = self.psc.from_data_frame(df)
    g = ddf.groupby('A')

    result = g.first().collect()
    expected = df.iloc[[1, 2]].set_index('A')
    assert_frame_equal(result, expected)

    expected = df.iloc[[1, 2]].set_index('A')
    result = g.nth(0, dropna='any').collect()
    assert_frame_equal(result, expected)
def test_indexing_mixed_frame_bug(self):
    # GH3492
    df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'},
                    'b': {1: 111, 2: 222, 3: 333}})

    # this works, new column is created correctly
    df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)

    # this does not work, ie column test is not changed
    idx = df['test'] == '_'
    temp = df.ix[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)
    df.ix[idx, 'test'] = temp
    self.assert_(df.iloc[0, 2] == '-----')
def test_onecolumn_of_integer(self):
    '''
    GH 3628
    a column_of_integers dataframe should transfer well to sql
    '''
    mono_df = DataFrame([1, 2], columns=['c0'])
    sql.write_frame(mono_df, con=self.db, name='mono_df')

    # computing the sum via sql
    con_x = self.db
    the_sum = sum([my_c0[0] for my_c0 in
                   con_x.execute("select * from mono_df")])
    # it should not fail, and gives 3 (Issue #3628)
    self.assertEqual(the_sum, 3)

    result = sql.read_frame("select * from mono_df", con_x)
    tm.assert_frame_equal(result, mono_df)
def test_agg_compat(self):
    # GH 12334
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    g = df.groupby(['A', 'B'])

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')])
    result = g['D'].agg({'C': ['sum', 'std']})
    assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = ['C', 'D']
    result = g['D'].agg({'C': 'sum', 'D': 'std'})
    assert_frame_equal(result, expected, check_like=True)
def test_level_groupby_get_group(self):
    # GH15155
    df = DataFrame(data=np.arange(2, 22, 2),
                   index=MultiIndex(
                       levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
                       labels=[[0] * 5 + [1] * 5, range(10)],
                       names=["Index1", "Index2"]))
    g = df.groupby(level=["Index1"])

    # expected should equal test.loc[["a"]]
    # GH15166
    expected = DataFrame(data=np.arange(2, 12, 2),
                         index=pd.MultiIndex(
                             levels=[pd.CategoricalIndex(["a", "b"]),
                                     range(5)],
                             labels=[[0] * 5, range(5)],
                             names=["Index1", "Index2"]))
    result = g.get_group('a')

    assert_frame_equal(result, expected)
def test_ix_general(self):
    # ix general issues

    # GH 2817
    data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444},
            'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0},
            'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}}
    df = DataFrame(data).set_index(keys=['col', 'year'])

    # this should raise correct error
    self.assertRaises(KeyError, df.ix.__getitem__, tuple([4.0, 2012]))

    # this is ok
    df.sortlevel(inplace=True)
    df.ix[(4.0, 2012)]
def setUp(self):
    import warnings
    warnings.filterwarnings(action='ignore', category=FutureWarning)

    self.series_ints = Series(np.random.rand(4), index=range(0, 8, 2))
    self.frame_ints = DataFrame(np.random.randn(4, 4),
                                index=range(0, 8, 2),
                                columns=range(0, 12, 3))
    self.panel_ints = Panel(np.random.rand(4, 4, 4),
                            items=range(0, 8, 2),
                            major_axis=range(0, 12, 3),
                            minor_axis=range(0, 16, 4))

    self.series_labels = Series(np.random.randn(4), index=list('abcd'))
    self.frame_labels = DataFrame(np.random.randn(4, 4),
                                  index=list('abcd'), columns=list('ABCD'))
    self.panel_labels = Panel(np.random.randn(4, 4, 4),
                              items=list('abcd'),
                              major_axis=list('ABCD'),
                              minor_axis=list('ZYXW'))

    self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
    self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                 index=[2, 4, 'null', 8])
    self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                             items=[2, 4, 'null', 8])

    self.series_ts = Series(np.random.randn(4),
                            index=date_range('20130101', periods=4))
    self.frame_ts = DataFrame(np.random.randn(4, 4),
                              index=date_range('20130101', periods=4))
    self.panel_ts = Panel(np.random.randn(4, 4, 4),
                          items=date_range('20130101', periods=4))

    # self.series_floats = Series(np.random.randn(4),
    #                             index=[1.00, 2.00, 3.00, 4.00])
    # self.frame_floats = DataFrame(np.random.randn(4, 4),
    #                               columns=[1.00, 2.00, 3.00, 4.00])
    # self.panel_floats = Panel(np.random.rand(4, 4, 4),
    #                           items=[1.00, 2.00, 3.00, 4.00])

    self.frame_empty = DataFrame({})
    self.series_empty = Series({})
    self.panel_empty = Panel({})

    # form agglomerates
    for o in self._objs:
        d = dict()
        for t in self._typs:
            d[t] = getattr(self, '%s_%s' % (o, t), None)
        setattr(self, o, d)
def test_bool_ops_warn_on_arithmetic(self):
    n = 10
    df = DataFrame({'a': np.random.rand(n) > 0.5,
                    'b': np.random.rand(n) > 0.5})
    names = 'add', 'mul', 'sub'
    ops = '+', '*', '-'
    subs = {'+': '|', '*': '&', '-': '^'}
    sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'}
    for op, name in zip(ops, names):
        f = getattr(operator, name)
        fe = getattr(operator, sub_funcs[subs[op]])

        if op == '-':
            # raises TypeError
            continue

        with tm.use_numexpr(True, min_elements=5):
            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df, df)
                e = fe(df, df)
                tm.assert_frame_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df.a, df.b)
                e = fe(df.a, df.b)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df.a, True)
                e = fe(df.a, True)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(False, df.a)
                e = fe(False, df.a)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(False, df)
                e = fe(False, df)
                tm.assert_frame_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df, True)
                e = fe(df, True)
                tm.assert_frame_equal(r, e)
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    filt_rhs = combined

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)
def test_bool_ops_warn_on_arithmetic(self):
    n = 10
    df = DataFrame({"a": np.random.rand(n) > 0.5,
                    "b": np.random.rand(n) > 0.5})
    names = "add", "mul", "sub"
    ops = "+", "*", "-"
    subs = {"+": "|", "*": "&", "-": "^"}
    sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}
    for op, name in zip(ops, names):
        f = getattr(operator, name)
        fe = getattr(operator, sub_funcs[subs[op]])

        if op == "-":
            # raises TypeError
            continue

        with tm.use_numexpr(True, min_elements=5):
            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df, df)
                e = fe(df, df)
                tm.assert_frame_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df.a, df.b)
                e = fe(df.a, df.b)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df.a, True)
                e = fe(df.a, True)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(False, df.a)
                e = fe(False, df.a)
                tm.assert_series_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(False, df)
                e = fe(False, df)
                tm.assert_frame_equal(r, e)

            with tm.assert_produces_warning(check_stacklevel=False):
                r = f(df, True)
                e = fe(df, True)
                tm.assert_frame_equal(r, e)
def setUp(self):
    self.ts = tm.makeTimeSeries()

    self.seriesd = tm.getSeriesData()
    self.tsd = tm.getTimeSeriesData()
    self.frame = DataFrame(self.seriesd)
    self.tsframe = DataFrame(self.tsd)

    self.df = DataFrame(
        {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
         'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
         'C': np.random.randn(8),
         'D': np.random.randn(8)})

    self.df_mixed_floats = DataFrame(
        {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
         'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
         'C': np.random.randn(8),
         'D': np.array(np.random.randn(8), dtype='float32')})

    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['A', 'B', 'C'])

    self.three_group = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})
def read_sql(self, sql, index_col=None, coerce_float=True, params=None,
             parse_dates=None):
    args = _convert_params(sql, params)
    cursor = self.execute(*args)
    columns = [col_desc[0] for col_desc in cursor.description]
    data = self._fetchall_as_list(cursor)
    cursor.close()

    data_frame = DataFrame.from_records(
        data, columns=columns, coerce_float=coerce_float)

    _parse_date_columns(data_frame, parse_dates)

    if index_col is not None:
        data_frame.set_index(index_col, inplace=True)
    return data_frame
def setUp(self):
    arr = randn(N)
    arr[self._nan_locs] = np.NaN

    self.arr = arr
    self.rng = DateRange(datetime(2009, 1, 1), periods=N)

    self.series = Series(arr.copy(), index=self.rng)

    self.frame = DataFrame(randn(N, K), index=self.rng,
                           columns=np.arange(K))

    self.matrix = DataMatrix(randn(N, K), index=self.rng,
                             columns=np.arange(K))
def test_bool_ops_warn_on_arithmetic(self, op_str, opname):
    n = 10
    df = DataFrame({"a": np.random.rand(n) > 0.5,
                    "b": np.random.rand(n) > 0.5})

    subs = {"+": "|", "*": "&", "-": "^"}
    sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}

    f = getattr(operator, opname)
    fe = getattr(operator, sub_funcs[subs[op_str]])

    if op_str == "-":
        # raises TypeError
        return

    with tm.use_numexpr(True, min_elements=5):
        with tm.assert_produces_warning(check_stacklevel=False):
            r = f(df, df)
            e = fe(df, df)
            tm.assert_frame_equal(r, e)

        with tm.assert_produces_warning(check_stacklevel=False):
            r = f(df.a, df.b)
            e = fe(df.a, df.b)
            tm.assert_series_equal(r, e)

        with tm.assert_produces_warning(check_stacklevel=False):
            r = f(df.a, True)
            e = fe(df.a, True)
            tm.assert_series_equal(r, e)

        with tm.assert_produces_warning(check_stacklevel=False):
            r = f(False, df.a)
            e = fe(False, df.a)
            tm.assert_series_equal(r, e)

        with tm.assert_produces_warning(check_stacklevel=False):
            r = f(False, df)
            e = fe(False, df)
            tm.assert_frame_equal(r, e)

        with tm.assert_produces_warning(check_stacklevel=False):
            r = f(df, True)
            e = fe(df, True)
            tm.assert_frame_equal(r, e)
def read_sql(self, sql, index_col=None, coerce_float=True,
             parse_dates=None, params=None):
    args = _convert_params(sql, params)
    result = self.execute(*args)
    data = result.fetchall()
    columns = result.keys()

    data_frame = DataFrame.from_records(
        data, columns=columns, coerce_float=coerce_float)

    _parse_date_columns(data_frame, parse_dates)

    if index_col is not None:
        data_frame.set_index(index_col, inplace=True)
    return data_frame
def _flex_binary_moment(arg1, arg2, f, pairwise=False):
    if not (isinstance(arg1, (np.ndarray, Series, DataFrame)) and
            isinstance(arg2, (np.ndarray, Series, DataFrame))):
        raise TypeError("arguments to moment function must be of type "
                        "np.ndarray/Series/DataFrame")

    if isinstance(arg1, (np.ndarray, Series)) and \
            isinstance(arg2, (np.ndarray, Series)):
        X, Y = _prep_binary(arg1, arg2)
        return f(X, Y)
    elif isinstance(arg1, DataFrame):
        results = {}
        if isinstance(arg2, DataFrame):
            X, Y = arg1.align(arg2, join='outer')
            if pairwise is False:
                X = X + 0 * Y
                Y = Y + 0 * X
                res_columns = arg1.columns.union(arg2.columns)
                for col in res_columns:
                    if col in X and col in Y:
                        results[col] = f(X[col], Y[col])
            elif pairwise is True:
                results = defaultdict(dict)
                for i, k1 in enumerate(arg1.columns):
                    for j, k2 in enumerate(arg2.columns):
                        if j < i and arg2 is arg1:
                            # Symmetric case
                            results[k1][k2] = results[k2][k1]
                        else:
                            results[k1][k2] = f(
                                *_prep_binary(arg1[k1], arg2[k2]))
                return Panel.from_dict(results).swapaxes('items', 'major')
            else:
                raise ValueError("'pairwise' is not True/False")
        else:
            res_columns = arg1.columns
            X, Y = arg1.align(arg2, axis=0, join='outer')
            results = {}

            for col in res_columns:
                results[col] = f(X[col], Y)

        return DataFrame(results, index=X.index, columns=res_columns)
    else:
        return _flex_binary_moment(arg2, arg1, f)
def test_bool_ops_raise_on_arithmetic(self):
    df = DataFrame({'a': np.random.rand(10) > 0.5,
                    'b': np.random.rand(10) > 0.5})
    names = 'add', 'mul', 'sub', 'div', 'truediv', 'floordiv', 'pow'
    ops = '+', '*', '-', '/', '/', '//', '**'
    msg = 'operator %r not implemented for bool dtypes'
    for op, name in zip(ops, names):
        if not compat.PY3 or name != 'div':
            f = getattr(operator, name)
            err_msg = re.escape(msg % op)
            with tm.assertRaisesRegexp(NotImplementedError, err_msg):
                f(df, df)
            with tm.assertRaisesRegexp(NotImplementedError, err_msg):
                f(df.a, df.b)
def _check(method):
    series = self.frame[1]

    res = method(series, self.frame, 10)
    res2 = method(self.frame, series, 10)
    exp = self.frame.apply(lambda x: method(series, x, 10))

    tm.assert_frame_equal(res, exp)
    tm.assert_frame_equal(res2, exp)

    frame2 = self.frame.copy()
    frame2.values[:] = np.random.randn(*frame2.shape)

    res3 = method(self.frame, frame2, 10)
    exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10))
                         for k in self.frame))
    tm.assert_frame_equal(res3, exp)
def _take_new_index(obj, indexer, new_index, axis=0):
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        data = obj._data

        new_blocks = [b.take(indexer, axis=1) for b in data.blocks]
        new_axes = list(data.axes)
        new_axes[1] = new_index
        new_data = BlockManager(new_blocks, new_axes)
        return DataFrame(new_data)
    else:
        raise NotImplementedError
def _rollingMoment(arg, window, func, minp, time_rule=None):
    """
    Rolling statistical measure using supplied function. Designed to be
    used with passed-in Cython array-based functions.

    Parameters
    ----------
    arg : DataFrame or numpy ndarray-like
    window : Number of observations used for calculating statistic
    func : Cython function to compute rolling statistic on raw series
    minp : int
        Minimum number of observations required to have a value
    """
    types = (DataFrame, DataMatrix, Series)
    if time_rule is not None and isinstance(arg, types):
        # Conform to whatever frequency needed.
        arg = arg.asfreq(time_rule)

    if isinstance(arg, DataMatrix):
        T, N = arg.values.shape
        resultMatrix = np.empty((T, N), dtype=arg.values.dtype)
        arg.values[np.isinf(arg.values)] = NaN
        for i in range(N):
            resultMatrix[:, i] = func(arg.values[:, i], window, minp=minp)
        output = DataMatrix(resultMatrix, index=arg.index,
                            columns=arg.columns)
    elif isinstance(arg, DataFrame):
        output = DataFrame(index=arg.index)
        for col, series in arg.iteritems():
            series[np.isinf(series)] = NaN
            output[col] = Series(func(series, window, minp=minp),
                                 index=series.index)
    elif isinstance(arg, Series):
        arg[np.isinf(arg)] = NaN
        output = Series(func(arg, window, minp=minp), index=arg.index)
    else:
        try:
            assert (hasattr(arg, '__iter__'))
        except AssertionError:
            raise AssertionError('Expected DataFrame or array-like argument')

        arg[np.isinf(arg)] = NaN
        output = func(arg, window, minp=minp)

    return output
def test_xs_multiindex(self):
    # GH2903
    columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                      ('b', 'hello'), ('b', 'world')],
                                     names=['lvl0', 'lvl1'])
    df = DataFrame(np.random.randn(4, 4), columns=columns)
    df.sortlevel(axis=1, inplace=True)
    result = df.xs('a', level='lvl0', axis=1)
    expected = df.iloc[:, 0:2].loc[:, 'a']
    assert_frame_equal(result, expected)

    result = df.xs('foo', level='lvl1', axis=1)
    expected = df.iloc[:, 1:2].copy()
    expected.columns = expected.columns.droplevel('lvl1')
    assert_frame_equal(result, expected)