Example #1
0
# Group the 'data1' column by the values in the 'key1' column. This only
# builds the grouped object; nothing is aggregated yet.
df['data1'].groupby(df['key1'])
# Same grouping, but on the one-column selection df[['data2']] (list of names).
df[['data2']].groupby(df['key1'])

# Group by both keys, select the column list ['data2'], and take the mean of
# each (key1, key2) group.
df.groupby(['key1', 'key2'])[['data2']].mean()

# Single brackets select just the 'data2' column from the same two-key grouping.
s_grouped = df.groupby(['key1', 'key2'])['data2']
# Bare expression: presumably here so an interactive session displays the object.
s_grouped

# Mean of 'data2' within each (key1, key2) group.
s_grouped.mean()


# ### Grouping with a dict or Series
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values. The removed `.ix` indexer treated the integer slice 2:3
# as positional on this string index, so the row is picked by position via
# people.index and the columns by label.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

# Column label -> group name. 'f' is not a column of `people`; grouping keys
# that match nothing are simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

# Group the columns by operating on the transposed frame (groupby's axis=1
# argument is deprecated), then transpose the aggregate back so that rows are
# people and columns are group names.
by_column = people.T.groupby(mapping)
by_column.sum().T

# The same mapping expressed as a Series works as a grouping key too.
map_series = Series(mapping)
map_series

# Number of non-NA values per person within each column group.
people.T.groupby(map_series).count().T


# ### Grouping with functions
Example #2
0
## Iterating over groups
# Iterating a groupby yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
# Iterating with multiple keys: the group key is a tuple, unpacked in place.
for (key1, key2), group in df.groupby(['key1', 'key2']):
    # Print the unpacked key pair; the loop variables are key1/key2 (the names
    # k1/k2 are never defined). The %-format keeps the space-separated output
    # identical under both Python 2 and Python 3.
    print('%s %s' % (key1, key2))
## Selecting a single column or a subset of columns
# Single brackets: select the one column 'data1', then take the mean per key1.
df.groupby('key1')['data1'].mean()
# Double brackets: select by a list of column names (here just 'data1').
df.groupby('key1')[['data1']].mean()
## Grouping with a dict or Series
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out 'b' and 'c' for the row at position 2. The removed `.ix` indexer
# treated the integer slice 2:3 as positional on this string index, so the row
# is picked by position via people.index and the columns by label.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
# Column label -> group name; 'f' matches no column and is simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# Group the columns by operating on the transposed frame (groupby's axis=1
# argument is deprecated), then transpose the sums back so that rows are
# people and columns are group names.
by_columns = people.T.groupby(mapping)
by_columns.sum().T

## Data aggregation
# Group df by the 'key1' column; the grouped object is reused below.
grouped = df.groupby('key1')
# 0.9 quantile of 'data1' within each group.
grouped['data1'].quantile(0.9)
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Pass the custom peak_to_peak function to agg on the grouped object.
grouped.agg(peak_to_peak)
# describe() on the grouped object; presumably shown for its per-group summary.
grouped.describe()  

tips = pd.read_csv('ch08/tips.csv')
# Assigning through attribute access (tips.tip_pct = ...) only sets a Python
# attribute on the object and cannot create a new column, so the new column
# is created with item assignment instead.
tips['tip_pct'] = tips['tip'] / tips['total_bill']
Example #3
0
	print 'the value is:\n',group 



# 5x5 frame of random values; the index values are people's names.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
print(people)
# Sample output from one run (values are random):
"""
               a         b         c         d         e
Joe     1.304699  0.100459 -0.000408 -1.095217 -1.142781
Steve  -1.224551  0.478045 -1.328901 -0.365792 -1.339277
Wes     0.330814 -0.768008 -0.599442 -0.854585 -0.174300
Jim     0.701609 -1.466142 -0.207906 -0.870489  0.963129
Travis -2.215134 -0.821001  0.361285 -0.935930 -0.472026

"""
# Add some NA values. The removed `.ix` indexer treated the integer slice 2:3
# as positional on this string index, so the row is picked by position via
# people.index and the columns by label.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
print(people)
# Sample output after the assignment: only row 'Wes' has NaN in 'b' and 'c'.
"""
               a         b         c         d         e
Joe     1.304699  0.100459 -0.000408 -1.095217 -1.142781
Steve  -1.224551  0.478045 -1.328901 -0.365792 -1.339277
Wes     0.330814       NaN       NaN -0.854585 -0.174300
Jim     0.701609 -1.466142 -0.207906 -0.870489  0.963129
Travis -2.215134 -0.821001  0.361285 -0.935930 -0.472026

"""
# Column label -> group name; 'f' matches no column and is simply unused.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# Grouping can run along the index (rows) or along the columns; by default it
# runs along the index. To group the columns, group the transposed frame and
# transpose the result back (groupby's axis=1 argument is deprecated).
by_column = people.T.groupby(mapping)

# Sum the columns that belong to each group, one row per person.
print(by_column.sum().T)