def test_pivot_table(self): index = ['A', 'B'] columns = 'C' table = pivot_table(self.data, values='D', index=index, columns=columns) table2 = self.data.pivot_table(values='D', index=index, columns=columns) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values='D', index=index) if len(index) > 1: self.assertEqual(table.index.names, tuple(index)) else: self.assertEqual(table.index.name, index[0]) if len(columns) > 1: self.assertEqual(table.columns.names, columns) else: self.assertEqual(table.columns.name, columns[0]) expected = self.data.groupby(index + [columns])['D'].agg( np.mean).unstack() tm.assert_frame_equal(table, expected)
def test_pivot_datetime_tz(self): dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, 'value1': range(6), 'value2': [1, 2] * 3}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1') exp_col1 = Index(['value1', 'value1']) exp_col2 = Index(['a', 'b'], name='label') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1']) tm.assert_frame_equal(result, expected) exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean']) exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4, tz='Asia/Tokyo', name='dt2') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'], aggfunc=[np.sum, np.mean]) tm.assert_frame_equal(result, expected)
def test_pivot_table(self): index = ['A', 'B'] columns = 'C' table = pivot_table(self.data, values='D', index=index, columns=columns) table2 = self.data.pivot_table( values='D', index=index, columns=columns) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values='D', index=index) if len(index) > 1: self.assertEqual(table.index.names, tuple(index)) else: self.assertEqual(table.index.name, index[0]) if len(columns) > 1: self.assertEqual(table.columns.names, columns) else: self.assertEqual(table.columns.name, columns[0]) expected = self.data.groupby( index + [columns])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected)
def test_pivot_datetime_tz(self): dates1 = [ '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00' ] dates2 = [ '2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00' ] df = DataFrame({ 'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, 'value1': np.arange(6, dtype='int64'), 'value2': [1, 2] * 3 }) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) exp_idx = pd.DatetimeIndex([ '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00' ], tz='US/Pacific', name='dt1') exp_col1 = Index(['value1', 'value1']) exp_col2 = Index(['a', 'b'], name='label') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1']) tm.assert_frame_equal(result, expected) exp_col1 = Index( ['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean']) exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) exp_col3 = pd.DatetimeIndex( ['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4, tz='Asia/Tokyo', name='dt2') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) expected = DataFrame(np.array( [[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2]], dtype='int64'), index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'], aggfunc=[np.sum, np.mean]) tm.assert_frame_equal(result, expected)
def test_pivot_dtaccessor(self): # GH 8103 dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, 'value1': np.arange(6, dtype='int64'), 'value2': [1, 2] * 3}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d)) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d)) result = pivot_table(df, index='label', columns=df['dt1'].dt.hour, values='value1') exp_idx = Index(['a', 'b'], name='label') expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=exp_idx, columns=Index([7, 8, 9], name='dt1')) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df['dt2'].dt.month, columns=df['dt1'].dt.hour, values='value1') expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=Index([1, 2], name='dt2'), columns=Index([7, 8, 9], name='dt1')) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df['dt2'].dt.year.values, columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') exp_col = MultiIndex.from_arrays( [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2']) expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'), index=[2013], columns=exp_col) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=np.array(['X', 'X', 'X', 'X', 'Y', 'Y']), columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]), index=['X', 'Y'], columns=exp_col) tm.assert_frame_equal(result, expected)
def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], index='A', columns=['B', 'C'], fill_value=0) expected = pivot_table(self.data.drop(['F'], axis=1), index='A', columns=['B', 'C'], fill_value=0) tm.assert_frame_equal(result, expected)
def test_pivot_table_warnings(self): index = ['A', 'B'] columns = 'C' with tm.assert_produces_warning(FutureWarning): table = pivot_table(self.data, values='D', rows=index, cols=columns) with tm.assert_produces_warning(False): table2 = pivot_table(self.data, values='D', index=index, columns=columns) tm.assert_frame_equal(table, table2)
def test_pivot_datetime_tz(self): dates1 = [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ] dates2 = [ "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", ] df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], "dt1": dates1, "dt2": dates2, "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } ) df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) exp_idx = pd.DatetimeIndex( ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], tz="US/Pacific", name="dt1" ) exp_col1 = Index(["value1", "value1"]) exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"]) exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) exp_col3 = pd.DatetimeIndex(["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, tz="Asia/Tokyo", name="dt2") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) expected = DataFrame( np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2]], dtype="int64"), index=exp_idx, columns=exp_col, ) result = pivot_table(df, index=["dt1"], columns=["dt2"], values=["value1", "value2"], aggfunc=[np.sum, np.mean]) tm.assert_frame_equal(result, expected)
def test_pivot_multi_functions(self): f = lambda func: pivot_table(self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? f = lambda func: pivot_table( self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func, margins=True ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected)
def frame(self, id, cursor=None, **kwargs): stacks = self.stack(id, cursor, **kwargs) logger.debug('Start Pivot stacks') t1 = Timer() stacks['value'] = True result = pivot_table(stacks, 'value', 'date', 'stock', fill_value=False) result.columns = numpy.char.mod('%06d', result.columns) logger.debug('Finished pivoting in %s', t1) return result
def test_pivot_multi_functions(self): f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? f = lambda func: pivot_table(self.data, values=['D', 'E'], rows=['A', 'B'], cols='C', aggfunc=func, margins=True) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected)
def test_pivot_dtypes(self): # can convert dtypes f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']}) self.assertEqual(f.dtypes['v'], 'int64') z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.sum) result = z.get_dtype_counts() expected = Series(dict(int64 = 2)) tm.assert_series_equal(result, expected) # cannot convert dtypes f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']}) self.assertEqual(f.dtypes['v'], 'float64') z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.mean) result = z.get_dtype_counts() expected = Series(dict(float64 = 2)) tm.assert_series_equal(result, expected)
def test_pivot_dtaccessor(self): # GH 8103 dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, 'value1': np.arange(6,dtype='int64'), 'value2': [1, 2] * 3}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d)) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d)) result = pivot_table(df, index='label', columns=df['dt1'].dt.hour, values='value1') exp_idx = Index(['a', 'b'], name='label') expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, index=exp_idx, columns=Index([7, 8, 9],name='dt1')) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df['dt2'].dt.month, columns=df['dt1'].dt.hour, values='value1') expected = DataFrame({7: [0, 3], 8: [1, 4], 9:[2, 5]}, index=Index([1, 2],name='dt2'), columns=Index([7, 8, 9],name='dt1')) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df['dt2'].dt.year.values, columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3],names=['dt1','dt2']) expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]],dtype='int64'), index=[2013], columns=exp_col) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=np.array(['X', 'X', 'X', 'X', 'Y', 'Y']), columns=[df['dt1'].dt.hour, df['dt2'].dt.month], values='value1') expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]), index=['X', 'Y'], columns=exp_col) tm.assert_frame_equal(result, expected)
def test_pivot_multi_functions(self): f = lambda func: pivot_table(self.data, values=['D', 'E'], index=['A', 'B'], columns='C', aggfunc=func) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? f = lambda func: pivot_table(self.data, values=['D', 'E'], index=['A', 'B'], columns='C', aggfunc=func, margins=True) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) expected = concat([means, stds], keys=['mean', 'std'], axis=1) tm.assert_frame_equal(result, expected)
def test_pivot_dtypes(self): # can convert dtypes f = DataFrame({"a": ["cat", "bat", "cat", "bat"], "v": [1, 2, 3, 4], "i": ["a", "b", "a", "b"]}) self.assertEqual(f.dtypes["v"], "int64") z = pivot_table(f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum) result = z.get_dtype_counts() expected = Series(dict(int64=2)) tm.assert_series_equal(result, expected) # cannot convert dtypes f = DataFrame({"a": ["cat", "bat", "cat", "bat"], "v": [1.5, 2.5, 3.5, 4.5], "i": ["a", "b", "a", "b"]}) self.assertEqual(f.dtypes["v"], "float64") z = pivot_table(f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean) result = z.get_dtype_counts() expected = Series(dict(float64=2)) tm.assert_series_equal(result, expected)
def test_pivot_dtypes(self): # can convert dtypes f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']}) self.assertEqual(f.dtypes['v'], 'int64') z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum) result = z.get_dtype_counts() expected = Series(dict(int64 = 2)) tm.assert_series_equal(result, expected) # cannot convert dtypes f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']}) self.assertEqual(f.dtypes['v'], 'float64') z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean) result = z.get_dtype_counts() expected = Series(dict(float64 = 2)) tm.assert_series_equal(result, expected)
def test_pivot_table(self): rows = ['A', 'B'] cols= 'C' table = pivot_table(self.data, values='D', rows=rows, cols=cols) # this works pivot_table(self.data, values='D', rows=rows) if len(rows) > 1: self.assertEqual(table.index.names, rows) else: self.assertEqual(table.index.name, rows[0]) if len(cols) > 1: self.assertEqual(table.columns.names, cols) else: self.assertEqual(table.columns.name, cols[0]) expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() assert_frame_equal(table, expected)
def test_pivot_table_with_margins_set_margin_name(self): # GH 3335 for margin_name in ['foo', 'one', 666, None, ['a', 'b']]: with self.assertRaises(ValueError): # multi-index index pivot_table(self.data, values='D', index=['A', 'B'], columns=['C'], margins=True, margins_name=margin_name) with self.assertRaises(ValueError): # multi-index column pivot_table(self.data, values='D', index=['C'], columns=['A', 'B'], margins=True, margins_name=margin_name) with self.assertRaises(ValueError): # non-multi-index index/column pivot_table(self.data, values='D', index=['A'], columns=['B'], margins=True, margins_name=margin_name)
def test_pivot_table(self): rows = ['A', 'B'] cols = 'C' table = pivot_table(self.data, values='D', rows=rows, cols=cols) table2 = self.data.pivot_table(values='D', rows=rows, cols=cols) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values='D', rows=rows) if len(rows) > 1: self.assertEqual(table.index.names, rows) else: self.assertEqual(table.index.name, rows[0]) if len(cols) > 1: self.assertEqual(table.columns.names, cols) else: self.assertEqual(table.columns.name, cols[0]) expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() tm.assert_frame_equal(table, expected)
def test_pivot_table(self): rows = ["A", "B"] cols = "C" table = pivot_table(self.data, values="D", rows=rows, cols=cols) table2 = self.data.pivot_table(values="D", rows=rows, cols=cols) tm.assert_frame_equal(table, table2) # this works pivot_table(self.data, values="D", rows=rows) if len(rows) > 1: self.assertEqual(table.index.names, rows) else: self.assertEqual(table.index.name, rows[0]) if len(cols) > 1: self.assertEqual(table.columns.names, cols) else: self.assertEqual(table.columns.name, cols[0]) expected = self.data.groupby(rows + [cols])["D"].agg(np.mean).unstack() tm.assert_frame_equal(table, expected)
def metric_vs_property( data, metrics=('completion_time',), property='agent_parameter', aggfunc=np.mean, ): table = pivot_table( df, values=list(metrics), rows=[property], #cols=[metric], aggfunc=np.mean ) return table
def add_indicator_data_to_rc_df(self,rc_df, i_id): ''' left join the region / campaign dataframe with the stored data for each campaign. ''' column_header = ['region_id','campaign_id'] column_header.append(i_id) indicator_df = DataFrame(list(DataPointComputed.objects.filter( indicator_id = i_id).values())) pivoted_indicator_df = pivot_table(indicator_df, values='value',\ columns=['indicator_id'],index = ['region_id','campaign_id']) cleaned_df = pivoted_indicator_df.reset_index(level=[0,1], inplace=False) merged_df = rc_df.merge(cleaned_df,how='left') merged_df = merged_df.reset_index(drop=True) return merged_df
def add_indicator_data_to_rc_df(self, rc_df, i_id): ''' left join the region / campaign dataframe with the stored data for each campaign. ''' column_header = ['region_id', 'campaign_id'] column_header.append(i_id) indicator_df = DataFrame( list(DataPointComputed.objects.filter(indicator_id=i_id).values())) pivoted_indicator_df = pivot_table(indicator_df, values='value',\ columns=['indicator_id'],index = ['region_id','campaign_id']) cleaned_df = pivoted_indicator_df.reset_index(level=[0, 1], inplace=False) merged_df = rc_df.merge(cleaned_df, how='left') merged_df = merged_df.reset_index(drop=True) return merged_df
def test_pivot_table_with_margins_set_margin_name(self): # GH 3335 for margin_name in ["foo", "one", 666, None, ["a", "b"]]: with self.assertRaises(ValueError): # multi-index index pivot_table( self.data, values="D", index=["A", "B"], columns=["C"], margins=True, margins_name=margin_name ) with self.assertRaises(ValueError): # multi-index column pivot_table( self.data, values="D", index=["C"], columns=["A", "B"], margins=True, margins_name=margin_name ) with self.assertRaises(ValueError): # non-multi-index index/column pivot_table(self.data, values="D", index=["A"], columns=["B"], margins=True, margins_name=margin_name)
def test_pivot_timegrouper(self): df = DataFrame({ 'Branch': 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], 'Date': [ datetime(2013, 1, 1), datetime(2013, 1, 1), datetime(2013, 10, 1), datetime(2013, 10, 2), datetime(2013, 10, 1), datetime(2013, 10, 2), datetime(2013, 12, 2), datetime(2013, 12, 2), ] }).set_index('Date') expected = DataFrame(np.array([10, 18, 3], dtype='int64').reshape(1, 3), index=[datetime(2013, 12, 31)], columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) expected = DataFrame( np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), index=[datetime(2013, 1, 1), datetime(2013, 7, 1)], columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) # passing the name df = df.reset_index() result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) self.assertRaises( KeyError, lambda: pivot_table(df, index=Grouper(freq='6MS', key='foo'), columns='Buyer', values='Quantity', aggfunc=np.sum)) self.assertRaises( KeyError, lambda: pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum)) # passing the level df = df.set_index('Date') result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) self.assertRaises( ValueError, lambda: pivot_table(df, index=Grouper(freq='6MS', level='foo'), columns='Buyer', values='Quantity', aggfunc=np.sum)) self.assertRaises( ValueError, lambda: pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum)) # double grouper df = DataFrame({ 'Branch': 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], 'Date': [ datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 11, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 10, 2, 12, 0), datetime(2013, 12, 5, 14, 0) ], 'PayDay': [ datetime(2013, 10, 4, 0, 0), datetime(2013, 10, 15, 13, 5), datetime(2013, 9, 5, 20, 0), datetime(2013, 11, 2, 10, 0), datetime(2013, 10, 7, 20, 0), datetime(2013, 9, 5, 10, 0), datetime(2013, 12, 30, 12, 0), datetime(2013, 11, 20, 14, 0), ] }) result = pivot_table(df, index=Grouper(freq='M', key='Date'), columns=Grouper(freq='M', key='PayDay'), values='Quantity', aggfunc=np.sum) expected = DataFrame(np.array([ np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan ]).reshape(4, 4), index=[ datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31) ], columns=[ datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31) ]) expected.index.name = 'Date' expected.columns.name = 'PayDay' tm.assert_frame_equal(result, expected) result = pivot_table(df, index=Grouper(freq='M', key='PayDay'), columns=Grouper(freq='M', key='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) tuples = [ (datetime(2013, 9, 30), datetime(2013, 10, 31)), (datetime(2013, 10, 31), datetime(2013, 9, 30)), (datetime(2013, 10, 31), datetime(2013, 11, 30)), (datetime(2013, 10, 31), datetime(2013, 12, 31)), (datetime(2013, 11, 30), datetime(2013, 10, 31)), (datetime(2013, 12, 31), datetime(2013, 11, 30)), ] idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) expected = DataFrame(np.array( [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2), index=idx, columns=['A', 'B']) expected.columns.name = 'Branch' result = pivot_table(df, index=[ Grouper(freq='M', key='Date'), Grouper(freq='M', key='PayDay') ], columns=['Branch'], values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=['Branch'], columns=[ Grouper(freq='M', key='Date'), Grouper(freq='M', key='PayDay') ], values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T)
def pivot(data_frames_dict, column_tuples, row_tuples, displayed_value_tuples, filters, ret, div_on_top=False, aggfunc=None): """ Returns a pivoted data frame data_frames_dict: a dictionary with csv paths as keys and Pandas.DataFrame as values column_tuples: ordered list of tuples (csv_path, name_of_chosen_column) row_tuples: ordered list of tuples (csv_path, name_of_chosen_column) """ from collections import OrderedDict if aggfunc is None: aggfunc = 'sum' output_data_frames = OrderedDict() # make list of needed columns second_elem = lambda tup: tup[1] row_names = [second_elem(row_tup) for row_tup in row_tuples] column_names = [second_elem(row_tup) for row_tup in column_tuples] # all_columns = row_names + column_names fitlered_data_frames = _apply_filters(data_frames_dict, filters) # for each displayed dimension create a data frame, then merge them before displaying for displayed_value_tuple in displayed_value_tuples: csv_path, displayed_column_name = displayed_value_tuple df_column_dict = defaultdict(set) df_column_dict[csv_path].add(displayed_column_name) for csv_path, column_name in column_tuples: df_column_dict[csv_path].add(column_name) for csv_path, row_name in row_tuples: df_column_dict[csv_path].add(row_name) for csv_path, merit_name in displayed_value_tuples: df_column_dict[csv_path].add(merit_name) # if columns from different tables -> join on the columns with the same # name needed_data_frames = df_column_dict.keys() if len(needed_data_frames) > 1: data_frame = _merge(fitlered_data_frames, df_column_dict) else: data_frame = fitlered_data_frames[df_column_dict.keys()[0]].reset_index() chosen_columns = reduce(lambda x, y: x.union(y), df_column_dict.values()) # data_frame.to_csv("./merged.csv") try: data_frame_with_dropped_columns = _drop(data_frame=data_frame, chosen_columns=chosen_columns) except Exception, e: raise DropException(str(e)) column_to_hex(data_frame, 'bb') column_to_hex(data_frame, 'bb.1') from pandas.tools.pivot import pivot_table data_frame = pivot_table(data_frame_with_dropped_columns, values=displayed_column_name, rows=row_names, cols=column_names, fill_value=0, aggfunc=aggfunc) output_data_frames[displayed_value_tuple] = data_frame print(data_frame) print("\n\n")
def runtests(var_file, out_file): pvalue_lines = [] input_file = "" with open(src_file, "rb") as sfile: for line in sfile.readlines(): input_file = os.path.abspath(line.strip()) model_variables = [] with open(var_file, "rb") as vfile: for line in vfile.readlines(): line = line.strip() if not line: continue model_variables.append(line.split()[0]) range_variables = [] with open(range_file, "rb") as rfile: for line in rfile.readlines(): line = line.strip() if not line: continue parts = line.strip().split() if 2 < len(parts): range_variables.append({"name":parts[0], "type":parts[1], "range":parts[2].split(",")}) ignore_records = [] with open(ignore_file, "rb") as rfile: for line in rfile.readlines(): line = line.strip() if not line: continue ignore_records.append(line) all_data = {} for var in model_variables: all_data[var] = [] with open(input_file, "rb") as ifile: reader = csv.reader(ifile) titles = reader.next() model_idx = [titles.index(var) for var in model_variables] r0 = 0 r = 0 for row in reader: if row[0] in ignore_records: continue r0 += 1 # Starts at 1, because of titles all_missing = True some_missing = False missing_dvar = row[model_idx[0]] == "\\N" for i in range(1, len(model_variables)): var_idx = model_idx[i] if row[var_idx] == "\\N": some_missing = True else: all_missing = False inside_range = True for var in range_variables: idx = titles.index(var["name"]) val = row[idx] if val == "\\N": continue vtype = var["type"] vrang = var["range"] test = True if vtype == "category": test = val in vrang else: test = float(vrang[0]) <= float(val) and float(val) < float(vrang[1]) inside_range = inside_range and test if not all_missing and not missing_dvar and inside_range: for i in range(0, len(model_variables)): var = model_variables[i] idx = model_idx[i] try: val = float(row[idx]) except ValueError: val = np.NaN all_data[var].append(val) r += 1 data = pd.DataFrame(all_data) cat_vars = [] num_vars = [] count = 0 with open(var_file, "rb") as vfile: for line in vfile.readlines(): line = line.strip() if not line: continue parts = line.split() if count == 0: dvar = parts[0] count = 1 continue count += 1 if parts[1] == "category": cat_vars.append(parts[0]) else: num_vars.append(parts[0]) for var in cat_vars: print "***********************" print var dat = data.loc[:,(var, dvar)] dat["VALUES"] = pd.Series(np.ones(len(dat[var])), index=dat.index) # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html counts = pivot_table(dat, values="VALUES", index=[var], columns=[dvar], aggfunc=np.sum, fill_value=0) fisher_ratio, fisher_pvalue = fisher_exact(counts) print counts pvalue_lines.append(var + "\t" + str(fisher_pvalue)) print fisher_ratio, fisher_pvalue for var in num_vars: values0 = data[data[dvar] == 0][var] values1 = data[data[dvar] == 1][var] values0 = values0[~np.isnan(values0)] values1 = values1[~np.isnan(values1)] print "***********************" print var print "mean/std for",dvar,"0:",np.mean(values0), "/", np.std(values0) print "mean/std for",dvar,"1:",np.mean(values1), "/", np.std(values1) ttest_stat, ttest_pvalue = ttest_ind(values0, values1, equal_var=False) print "Means are different at p-value",ttest_pvalue pvalue_lines.append(var + "\t" + str(ttest_pvalue)) with open(out_file, "w") as ofile: for line in pvalue_lines: ofile.write(line + "\n")
def test_pivot_multi_values(self): result = pivot_table(self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0) expected = pivot_table(self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0) tm.assert_frame_equal(result, expected)
def test_pivot_table_multiple(self): rows = ['A', 'B'] cols= 'C' table = pivot_table(self.data, rows=rows, cols=cols) expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected)
def test_pivot_dtaccessor(self): # GH 8103 dates1 = [ "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", "2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00", ] dates2 = [ "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-01-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", "2013-02-01 15:00:00", ] df = DataFrame( { "label": ["a", "a", "a", "b", "b", "b"], "dt1": dates1, "dt2": dates2, "value1": np.arange(6, dtype="int64"), "value2": [1, 2] * 3, } ) df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) result = pivot_table(df, index="label", columns=df["dt1"].dt.hour, values="value1") exp_idx = Index(["a", "b"], name="label") expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=exp_idx, columns=Index([7, 8, 9], name="dt1")) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1") expected = DataFrame( {7: [0, 3], 8: [1, 4], 9: [2, 5]}, index=Index([1, 2], name="dt2"), columns=Index([7, 8, 9], name="dt1") ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=df["dt2"].dt.year.values, columns=[df["dt1"].dt.hour, df["dt2"].dt.month], values="value1" ) exp_col = MultiIndex.from_arrays([[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"]) expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=np.array(["X", "X", "X", "X", "Y", "Y"]), columns=[df["dt1"].dt.hour, df["dt2"].dt.month], values="value1", ) expected = DataFrame( np.array([[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]), index=["X", "Y"], columns=exp_col, ) tm.assert_frame_equal(result, expected)
def test_pivot_multi_values(self): result = pivot_table(self.data, values=['D', 'E'], rows='A', cols=['B', 'C']) expected = pivot_table(self.data.drop(['F'], axis=1), rows='A', cols=['B', 'C']) assert_frame_equal(result, expected)
def test_pivot_table_multiple(self): index = ['A', 'B'] columns = 'C' table = pivot_table(self.data, index=index, columns=columns) expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected)
def test_pivot_table_multiple(self): rows = ['A', 'B'] cols = 'C' table = pivot_table(self.data, rows=rows, cols=cols) expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected)
def runtests(var_file, out_file): pvalue_lines = [] input_file = "" with open(src_file, "rb") as sfile: for line in sfile.readlines(): input_file = os.path.abspath(line.strip()) model_variables = [] with open(var_file, "rb") as vfile: for line in vfile.readlines(): line = line.strip() if not line: continue model_variables.append(line.split()[0]) range_variables = [] with open(range_file, "rb") as rfile: for line in rfile.readlines(): line = line.strip() if not line: continue parts = line.strip().split() if 2 < len(parts): range_variables.append({ "name": parts[0], "type": parts[1], "range": parts[2].split(",") }) ignore_records = [] with open(ignore_file, "rb") as rfile: for line in rfile.readlines(): line = line.strip() if not line: continue ignore_records.append(line) all_data = {} for var in model_variables: all_data[var] = [] with open(input_file, "rb") as ifile: reader = csv.reader(ifile) titles = reader.next() model_idx = [titles.index(var) for var in model_variables] r0 = 0 r = 0 for row in reader: if row[0] in ignore_records: continue r0 += 1 # Starts at 1, because of titles all_missing = True some_missing = False missing_dvar = row[model_idx[0]] == "\\N" for i in range(1, len(model_variables)): var_idx = model_idx[i] if row[var_idx] == "\\N": some_missing = True else: all_missing = False inside_range = True for var in range_variables: idx = titles.index(var["name"]) val = row[idx] if val == "\\N": continue vtype = var["type"] vrang = var["range"] test = True if vtype == "category": test = val in vrang else: test = float( vrang[0]) <= float(val) and float(val) < float( vrang[1]) inside_range = inside_range and test if not all_missing and not missing_dvar and inside_range: for i in range(0, len(model_variables)): var = model_variables[i] idx = model_idx[i] try: val = float(row[idx]) except ValueError: val = np.NaN all_data[var].append(val) r += 1 data = pd.DataFrame(all_data) cat_vars = [] num_vars = [] count = 0 with open(var_file, "rb") as vfile: for line in vfile.readlines(): line = line.strip() if not line: continue parts = line.split() if count == 0: dvar = parts[0] count = 1 continue count += 1 if parts[1] == "category": cat_vars.append(parts[0]) else: num_vars.append(parts[0]) for var in cat_vars: print "***********************" print var dat = data.loc[:, (var, dvar)] dat["VALUES"] = pd.Series(np.ones(len(dat[var])), index=dat.index) # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.pivot_table.html counts = pivot_table(dat, values="VALUES", index=[var], columns=[dvar], aggfunc=np.sum, fill_value=0) fisher_ratio, fisher_pvalue = fisher_exact(counts) print counts pvalue_lines.append(var + "\t" + str(fisher_pvalue)) print fisher_ratio, fisher_pvalue for var in num_vars: values0 = data[data[dvar] == 0][var] values1 = data[data[dvar] == 1][var] values0 = values0[~np.isnan(values0)] values1 = values1[~np.isnan(values1)] print "***********************" print var print "mean/std for", dvar, "0:", np.mean(values0), "/", np.std( values0) print "mean/std for", dvar, "1:", np.mean(values1), "/", np.std( values1) ttest_stat, ttest_pvalue = ttest_ind(values0, values1, equal_var=False) print "Means are different at p-value", ttest_pvalue pvalue_lines.append(var + "\t" + str(ttest_pvalue)) with open(out_file, "w") as ofile: for line in pvalue_lines: ofile.write(line + "\n")
def test_pivot_timegrouper(self): df = DataFrame({ 'Branch' : 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], 'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1), datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2), datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date') expected = DataFrame(np.array([10, 18, 3],dtype='int64').reshape(1, 3), index=[datetime.datetime(2013, 12, 31)], columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result,expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result,expected.T) expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)], columns='Carl Joe Mark'.split()) expected.index.name = 'Date' expected.columns.name = 'Buyer' result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) # passing the name df = df.reset_index() result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'), columns='Buyer', values='Quantity', aggfunc=np.sum)) self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum)) # passing the level df = df.set_index('Date') result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer', values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'), columns='Buyer', values='Quantity', aggfunc=np.sum)) self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum)) # double grouper df = DataFrame({ 'Branch' : 'A A A A A A A B'.split(), 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), 'Quantity': [1,3,5,1,8,1,9,3], 'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5), datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0), datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0), datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)], 'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5), datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0), datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0), datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]}) result = pivot_table(df, index=Grouper(freq='M', key='Date'), columns=Grouper(freq='M', key='PayDay'), values='Quantity', aggfunc=np.sum) expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4), index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)], columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)]) expected.index.name = 'Date' expected.columns.name = 'PayDay' tm.assert_frame_equal(result, expected) result = pivot_table(df, index=Grouper(freq='M', key='PayDay'), columns=Grouper(freq='M', key='Date'), values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)), (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)), (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)), (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)), (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)), (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),] idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2), index=idx, columns=['A', 'B']) expected.columns.name = 'Branch' result = pivot_table(df, index=[Grouper(freq='M', key='Date'), Grouper(freq='M', key='PayDay')], columns=['Branch'], values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'), Grouper(freq='M', key='PayDay')], values='Quantity', aggfunc=np.sum) tm.assert_frame_equal(result, expected.T)
fig, ax = plt.subplots(figsize=(10, 6)) ax.set_title(u'업종분포', fontproperties=fontprop) pos = arange(20) pos = pos[::-1] # reverse pos list plt.yticks(pos, [x.decode('utf8') for x in top20.index], fontproperties=fontprop) plt.barh(pos, top20.values, align='center', color=colors_list, alpha=0.7) # plt.show() df_semi = df[df['sector']=='소프트웨어'] print(df_semi.head(10)) #시가총액 합산 from pandas.tools.pivot import pivot_table ttable = df[['sector', 'marcap']] piv = pivot_table(ttable, values='marcap', rows=['sector'], aggfunc=np.sum) sector_marcap = piv.copy() sector_marcap.sort(ascending=False) print(sector_marcap[:10]) #업종별 시가총액 차트 import matplotlib.pylab as plt import matplotlib.font_manager as fm fontprop = fm.FontProperties(fname="fonts/malgun.ttf") top20 = sector_marcap[0:20] fig, ax = plt.subplots(figsize=(10, 6)) ax.set_title(u'업종별 시가총액', fontproperties=fontprop) pos = arange(20) pos = pos[::-1] # reverse pos list
def test_pivot_timegrouper(self): df = DataFrame( { "Branch": "A A A A A A A B".split(), "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], "Date": [ datetime(2013, 1, 1), datetime(2013, 1, 1), datetime(2013, 10, 1), datetime(2013, 10, 2), datetime(2013, 10, 1), datetime(2013, 10, 2), datetime(2013, 12, 2), datetime(2013, 12, 2), ], } ).set_index("Date") expected = DataFrame( np.array([10, 18, 3], dtype="int64").reshape(1, 3), index=[datetime(2013, 12, 31)], columns="Carl Joe Mark".split(), ) expected.index.name = "Date" expected.columns.name = "Buyer" result = pivot_table(df, index=Grouper(freq="A"), columns="Buyer", values="Quantity", aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index="Buyer", columns=Grouper(freq="A"), values="Quantity", aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) expected = DataFrame( np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), index=[datetime(2013, 1, 1), datetime(2013, 7, 1)], columns="Carl Joe Mark".split(), ) expected.index.name = "Date" expected.columns.name = "Buyer" result = pivot_table(df, index=Grouper(freq="6MS"), columns="Buyer", values="Quantity", aggfunc=np.sum) tm.assert_frame_equal(result, expected) result = pivot_table(df, index="Buyer", columns=Grouper(freq="6MS"), values="Quantity", aggfunc=np.sum) tm.assert_frame_equal(result, expected.T) # passing the name df = df.reset_index() result = pivot_table( df, index=Grouper(freq="6MS", key="Date"), columns="Buyer", values="Quantity", aggfunc=np.sum ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", key="Date"), values="Quantity", aggfunc=np.sum ) tm.assert_frame_equal(result, expected.T) self.assertRaises( KeyError, lambda: pivot_table( df, index=Grouper(freq="6MS", key="foo"), columns="Buyer", values="Quantity", aggfunc=np.sum ), ) self.assertRaises( KeyError, lambda: pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", key="foo"), values="Quantity", aggfunc=np.sum ), ) # passing the level df = df.set_index("Date") result = pivot_table( df, index=Grouper(freq="6MS", level="Date"), columns="Buyer", values="Quantity", aggfunc=np.sum ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", level="Date"), values="Quantity", aggfunc=np.sum ) tm.assert_frame_equal(result, expected.T) self.assertRaises( ValueError, lambda: pivot_table( df, index=Grouper(freq="6MS", level="foo"), columns="Buyer", values="Quantity", aggfunc=np.sum ), ) self.assertRaises( ValueError, lambda: pivot_table( df, index="Buyer", columns=Grouper(freq="6MS", level="foo"), values="Quantity", aggfunc=np.sum ), ) # double grouper df = DataFrame( { "Branch": "A A A A A A A B".split(), "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], "Date": [ datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5), datetime(2013, 10, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 11, 1, 20, 0), datetime(2013, 10, 2, 10, 0), datetime(2013, 10, 2, 12, 0), datetime(2013, 12, 5, 14, 0), ], "PayDay": [ datetime(2013, 10, 4, 0, 0), datetime(2013, 10, 15, 13, 5), datetime(2013, 9, 5, 20, 0), datetime(2013, 11, 2, 10, 0), datetime(2013, 10, 7, 20, 0), datetime(2013, 9, 5, 10, 0), datetime(2013, 12, 30, 12, 0), datetime(2013, 11, 20, 14, 0), ], } ) result = pivot_table( df, index=Grouper(freq="M", key="Date"), columns=Grouper(freq="M", key="PayDay"), values="Quantity", aggfunc=np.sum, ) expected = DataFrame( np.array( [np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9, np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan] ).reshape(4, 4), index=[datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31)], columns=[datetime(2013, 9, 30), datetime(2013, 10, 31), datetime(2013, 11, 30), datetime(2013, 12, 31)], ) expected.index.name = "Date" expected.columns.name = "PayDay" tm.assert_frame_equal(result, expected) result = pivot_table( df, index=Grouper(freq="M", key="PayDay"), columns=Grouper(freq="M", key="Date"), values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T) tuples = [ (datetime(2013, 9, 30), datetime(2013, 10, 31)), (datetime(2013, 10, 31), datetime(2013, 9, 30)), (datetime(2013, 10, 31), datetime(2013, 11, 30)), (datetime(2013, 10, 31), datetime(2013, 12, 31)), (datetime(2013, 11, 30), datetime(2013, 10, 31)), (datetime(2013, 12, 31), datetime(2013, 11, 30)), ] idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"]) expected = DataFrame( np.array([3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2), index=idx, columns=["A", "B"], ) expected.columns.name = "Branch" result = pivot_table( df, index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], columns=["Branch"], values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected) result = pivot_table( df, index=["Branch"], columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], values="Quantity", aggfunc=np.sum, ) tm.assert_frame_equal(result, expected.T)