Example #1
0
# Insert the data of column 'b' as an extra column labelled 'bar' at
# position 1 (argument order: position, label, data).
d.insert(loc=1, column='bar', value=d['b'])

# Build a new frame restricted to three entries of dt as the row index and
# carrying the current columns plus 'E'. Row/column labels can be changed
# this way; the alternative is to extract the data and construct a new frame.
d6 = d.reindex(
    index=dt[[0, 1, 4]],
    columns=list(d.columns) + ['E'],
)

# Relabel selected columns and index entries. The result is not assigned
# here, so this line only demonstrates the call.
d.rename(
    index={'a': 'apple', 'b': 'banana', 'd': 'durian'},
    columns={'one': 'foo', 'two': 'bar'},
)
# pd.DataFrame(np.asarray(d),index=new_index,columns=new_cols); # inefficient but works; d.index=xx; d.columns=xx; d.name=xx;
# SQL-like merge of two small frames on their shared 'key' column. The key
# 'fo' occurs twice on each side, so every left row pairs with every right row.
d7 = pd.DataFrame({'key': ['fo', 'fo'], 'val1': [1, 2]})
d8 = pd.DataFrame({'key': ['fo', 'fo'], 'val2': [4, 5]})
pd.merge(d7, d8, on='key')
# Prefer values from d and fall back to d2 where d is missing (NaN);
# MATLAB-style analogue: d(isnan(d)) = d2(isnan(d)).
d.combine_first(other=d2)

# process nan
# Element-wise mask over the whole object: entries that fail the test come
# back as NaN.
d[d > 0]

# Row selection: keep rows whose column 'a' (by label) or first column
# (by position) is positive. MATLAB analogue: d(0 < d(:,1), :).
d[d.a > 0]
d[d.iloc[:, 0] > 0]

# Missing-data helpers: drop rows containing any NaN, replace NaN with a
# constant, and build a boolean mask of the missing entries.
d.dropna(how='any')
d.fillna(value=5)
pd.isnull(d)
# f=lambda x:x.fillna(x.mean()); grp=xx; d3=grp.transform(f) # fill with grp mean

# stat,grouping
# Mean along axis 1 (one value per row); mean and similar reductions
# exclude missing data.
d.mean(axis=1)

# Occurrence count of each distinct value in ts.
ts.value_counts()

# apply() hands each column to the given function: a running sum, then the
# spread (max minus min) of each column.
d.apply(np.cumsum)
d.apply(lambda col: col.max() - col.min())

# Toy frame with two label columns ('A', 'B') and two columns of random
# draws ('C', 'D'), summed per distinct (A, B) pair.
d9 = pd.DataFrame({
    'A': ['fo', 'ba', 'fo', 'ba', 'fo', 'ba', 'fo', 'fo'],
    'B': ['a', 'a', 'b', 'c', 'b', 'b', 'a', 'c'],
    'C': randn(8),
    'D': randn(8),
})
d9.groupby(['A', 'B']).sum()
# Subtract column 'a' from every column, matching on the row index. The
# other arithmetic methods and the & / | operators can be used the same way.
d.sub(d['a'], axis='index')
# Bucket 100 random draws into quartiles (cut points at the 0, 25, 50, 75
# and 100 percent quantiles) and average the draws within each bucket.
d9 = pd.Series(np.random.randn(100))
factor = pd.qcut(d9, [0, .25, .5, .75, 1.])
# observed=False is spelled out because grouping by a categorical otherwise
# depends on a default that differs between pandas releases.
d9.groupby(factor, observed=False).mean()  # quartile means
# d.groupby(level=['A','B']); df.groupby(fn1,axis=1).groups
# Tag 1000 random draws with a randomly chosen country code, then group the
# draws by that code.
ctry = np.array(['US', 'UK', 'GR', 'JP'])
key = ctry[np.random.randint(0, 4, 1000)]
# The sample size is written as the integer 1000: randn takes integer
# dimensions, and a float such as 1e3 is rejected with a TypeError.
d2 = pd.DataFrame(randn(1000), index=key)

# Per-group summaries: size, mean, and a custom aggregate (standard deviation).
grp = d2.groupby(key)
grp.count()
grp.mean()
grp.agg(lambda x: x.std())

# Describe a single country's rows via a boolean mask on the key array;
# grp.get_group('JP') selects the same rows.
d2[key == 'JP'].apply(lambda x: x.describe())

# Attributes of the grouping, shown for inspection: the grouping key(s) and
# the row positions belonging to each group.
grp.keys
grp.indices

# filter_on_group (A(0<A) etc),apply(f)
# Eight rows in three groups of column 'B': 'a' x2, 'b' x4, 'c' x2.
d = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})

# Keep only the groups with more than two rows. With dropna=False the rows of
# rejected groups stay in the result, filled with NaN, instead of being
# removed. A predicate on the values works the same way,
# e.g. lambda x: 2 < x.sum().
d.groupby('B').filter(lambda g: len(g) > 2, dropna=False)
def f(grp):
    """Return a frame pairing a group's values with their demeaned values.

    The result has two columns: 'original' holds ``grp`` unchanged and
    'demeaned' holds ``grp`` minus its own mean.
    """
    centred = grp - grp.mean()
    return pd.DataFrame({'original': grp, 'demeaned': centred})
# Work on a copy of d0, overwrite column 'A' with three group labels
# (assumes d0 has six rows and a column 'C' -- TODO confirm), then run f on
# column 'C' of each group.
d = d0.copy()
d['A'] = [1, 1, 2, 2, 3, 3]
d.groupby('A')['C'].apply(f)