# --- Setup for a ligand-screening script (chunk: the `for m in mols:` loop
# body continues past this excerpt). Reformatted from a whitespace-collapsed
# paste; code tokens unchanged, only comments added. ---

# Index lists of the selected ("important") features for the two ligand models.
imp_feat_lig3 = np.genfromtxt(home_folder+'imp_files/imp_feat_lig3.txt', dtype=int)
imp_feat_lig2 = np.genfromtxt(home_folder+'imp_files/imp_feat_lig2.txt', dtype=int)
# Protein descriptor tables (float). prodes2 is row-indexed below, so it is 2-D.
prodes3 = np.genfromtxt(home_folder+'imp_files/prodes3.txt')
prodes2 = np.genfromtxt(home_folder+'imp_files/prodes2.txt')
# classifiers
# Pre-trained random-forest classifiers, deserialized with joblib
# (presumably scikit-learn estimators — TODO confirm against training code).
clfRF3 = joblib.load(home_folder+"clf_files/clfRF.pkl")
clfRF2 = joblib.load(home_folder+"clf_files/clfRF2.pkl")

# Deduplicate uniprotTP: uniT collects the unique IDs, temp the position of the
# FIRST occurrence of each ID in uniprotTP.
# NOTE(review): iterating a set yields an arbitrary order, so the ordering of
# uniT/idx is not reproducible across runs — confirm downstream code pairs
# uniT[k] with pd2[k] and never relies on a fixed order.
uniT = []
temp = []
for i in set(uniprotTP):
    uniT.append(i)
    temp.append(uniprotTP.index(i))
idx = np.asarray(temp)
# One descriptor row per unique protein, aligned with uniT.
pd2 = prodes2[idx]  #(1473, 35)

from collections import defaultdict
# Nested accumulators: r2[key_a][key_b] -> float, defaulting to 0.0.
r2 = defaultdict(lambda : defaultdict(float))
r3 = defaultdict(lambda : defaultdict(float))

from datetime import datetime
#print "Your job started at ", datetime.now()
t1 = datetime.now()  # wall-clock start time for the screening loop

# Iterate the molecule set; GetProp suggests RDKit-style Mol objects —
# TODO confirm where `mols` is built.
for m in mols:
    #lig = m.GetProp('DATABASE_ID')
    lig = m.GetProp('_Name')  # ligand identifier taken from the mol title line
    # ... loop body continues beyond this excerpt ...
# multi_idx,convert to cols, pivot table: xls_tbl1,graphically set up summary_tbl2="pivot table" (it pivots/rotates following chg in graph setup) d5=pd.DataFrame({'A':['one','one','two','three']*3,'B':['a','b','c']*4,'C':['foo','foo','foo','bar','bar','bar']*2,'D':np.random.randn(12),'E':np.random.randn(12)}) tp1=zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],['one','two','one','two','one','two','one','two']]) ix1=pd.MultiIndex.from_tuples(tp1,names=['first','second']); d5=pd.DataFrame(randn(8,4),index=ix1,columns=['A','B','C','D']); d6=d5[:4]; d7=d5.stack(); d7.unstack(); # multi-idx can be converted to cols ###? result.columns.levels # labels for multi-index; multi-ix order matters # pd.pivot_table(d5,values='D',rows=['A','B'],cols=['C']) # summary_table - grp by A,B,C # concat df d=d0.copy(); pd.concat([d[:2],d[2:5],d[5:]]) # rows pd.concat([d.ix[:,'A':'B'],d.ix[1:3,'C':'D']],axis=1) # cols; note df_single_col=TimeSeries # o1=pd.concat([p1,p2,p3],keys=['first','second','third'],join='outer') # generates hierarchial_multi-index (multi-ix order matters); can use multiple_keys,dict etc; d=pd.DataFrame(randn(10,4),columns=['a','b','c','d'],index=[pd.core.common.rands(5) for _ in xrange(10)]) # rand_strings pd.concat([d.ix[:7,['a','b']],d.ix[2:-2,['c']],d.ix[-7:,['d']]],axis=1,join_axes=[d.index]) # ix_orig (othw ix_sorted) pd.concat([d.ix[:7,['a','b']],d.ix[2:-2,['c']],d.ix[-7:,['d']]],join='inner') # add_row/col,copy,reindex,sql-like merge,fill_nan ts2=pd.Series([1,3,5,np.nan,6,8],index=dt[:6]); d.append([d.ix[1,],d.ix[0,]]); d.append(ts2.T,ignore_index=True); # d is NOT modified; append rows broken??? 
d.loc[:,'d']=np.array([5]*len(d)); d['g']=ts2[0:4] # cols; data outside of "master date list" is lost d5=d.copy(); d6=d4.pop('C'); del d['g'] d.insert(1,'bar',d['b']) # args posn,lbl,data d6=d.reindex(index=dt[[0,1,4]],columns=list(d.columns)+['E']) # can modify row/col names (can extract data and construct new df); d.rename(columns={'one' : 'foo','two' : 'bar'},index={'a' : 'apple','b' : 'banana','d' : 'durian'}) # rename # pd.DataFrame(np.asarray(d),index=new_index,columns=new_cols); # inefficient but works; d.index=xx; d.columns=xx; d.name=xx; d7=pd.DataFrame({'key':['fo','fo'],'val1':[1,2]}); d8=pd.DataFrame({'key':['fo','fo'],'val2':[4,5]}); pd.merge(d7,d8,on='key') # sql-like merge,very high eff; d.combine_first(d2) # ~fill_nan pref1,pref2, ~d(isnan(d))=d2(isnan(d)); # process nan d[0<d]; # NaN's if no data d[0<d.a]; d[0<d.iloc[:,0]]; # d(d(:,1)<0,:) select rows d.dropna(how='any'); d.fillna(value=5); pd.isnull(d) # f=lambda x:x.fillna(x.mean()); grp=xx; d3=grp.transform(f) # fill with grp mean