def runHMM(h, stepS=0.05, eps=1e-1, CD=None, E=None, save=True, verbose=1):
    """Split sites into null / positive / negative groups and refine ``s`` for each.

    Every site is scored three ways with ``mkv.computeLikelihoodReal``: under
    the null model (via ``getNullLikelihoods``), at ``s = -stepS`` and, for the
    sites where the null beats ``-stepS``, at ``s = +stepS``.  Sites whose
    likelihood at ``-stepS`` is at least the null likelihood form the negative
    group; among the remaining sites, those whose likelihood at ``+stepS``
    exceeds the null form the positive group and the rest keep ``s = 0``.
    The positive and negative groups are then refined with ``findML``.

    Args:
        h: value forwarded to ``mkv.computeLikelihoodReal`` and ``findML``;
            also used as the outer column level and in the output file name.
        stepS: magnitude of the first non-zero ``s`` tested and of the grid
            step passed to ``findML``.
        eps: minimum likelihood gain, forwarded to ``findML``.
        CD: site data; read from ``real/CDEidx.df`` under ``utl.outpath`` when
            ``None``.
        E: emissions; read from ``real/Emissions.df`` under ``utl.outpath``
            when ``None``.
        save: when true, pickle the result under ``real/HMM/``.
        verbose: when positive, print a one-line summary of the group sizes.

    Returns:
        DataFrame indexed by site whose columns are a two-level MultiIndex
        (``h``, ``stat``) with stats ``alt``, ``s`` and ``null``.
    """
    if CD is None:
        CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:
        E = pd.read_pickle(utl.outpath + 'real/Emissions.df')

    null_ll = getNullLikelihoods(CD, E)
    minus_ll = mkv.computeLikelihoodReal((CD, E, -stepS, h))
    # +stepS is only evaluated where the null model beats -stepS.
    plus_ll = mkv.computeLikelihoodReal((CD[null_ll > minus_ll], E, stepS, h))

    neg = minus_ll[null_ll <= minus_ll]

    # Turn each comparison into the index of its True entries: False becomes
    # None and is then dropped.
    null_at_plus = null_ll.loc[plus_ll.index]
    null_wins = (null_at_plus >= plus_ll).replace({False: None}).dropna().index
    plus_wins = (null_at_plus < plus_ll).replace({False: None}).dropna().index
    zero = null_ll.loc[null_wins]
    pos = plus_ll.loc[plus_wins]

    if verbose > 0:
        n_sites = CD.shape[0]
        # The backslash before % is not an escape sequence, so it is printed.
        summary = 'N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(
            n_sites, zero.size, zero.size / float(n_sites) * 100, pos.size, neg.size)
        print(summary)
        sys.stdout.flush()

    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt'])
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps, stepS)

    stacked = pd.concat([dfp, dfz, dfn])
    df = pd.concat([stacked, null_ll], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])

    if save:
        out_dir = utl.outpath + 'real/HMM/'
        utl.mkdir(out_dir)
        df.to_pickle(out_dir + 'h{:E}.df'.format(h))
    return df
def getNullLikelihoods(CD, E):
    """Return the likelihoods of ``CD`` under the null model, named ``'null'``.

    The null model is evaluated by calling ``mkv.computeLikelihoodReal`` with
    the last two tuple entries fixed to ``0`` and ``0.5``.

    Args:
        CD: site data forwarded to ``mkv.computeLikelihoodReal``.
        E: emissions forwarded to ``mkv.computeLikelihoodReal``.

    Returns:
        The object returned by ``mkv.computeLikelihoodReal`` with its ``name``
        attribute set to ``'null'``.
    """
    null_likelihoods = mkv.computeLikelihoodReal((CD, E, 0, 0.5))
    null_likelihoods.name = 'null'
    return null_likelihoods
def getNullLikelihoods37():
    """Return null-model likelihoods for the first 100 sites of the F37 sync file.

    Loads ``real/CDEidx.df`` from ``utl.outpath``, drops the columns whose
    ``GEN`` level equals 59, and keeps only the rows matching the first 100
    (sorted) index pairs built from columns 0 and 1 of the hard-coded
    ``F37.sync`` file.  The likelihoods are computed with
    ``mkv.computeLikelihoodReal`` using ``0`` and ``0.5`` as the last two
    tuple entries.

    Returns:
        The object returned by ``mkv.computeLikelihoodReal`` with its ``name``
        attribute set to ``'null'``.
    """
    counts = pd.read_pickle(utl.outpath + 'real/CDEidx.df')
    not_gen_59 = counts.columns.get_level_values('GEN') != 59
    counts = counts.loc[:, not_gen_59]

    # NOTE(review): absolute, user-specific path; fails on any other machine.
    sync = pd.read_csv(
        '/home/arya/workspace/CLEAR/sample_data/popoolation2/F37.sync',
        sep='\t',
        header=None,
    )
    first_sites = sync.set_index([0, 1]).sort_index().iloc[:100].index
    counts = counts.loc[first_sites].sort_index()

    emissions = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    null_likelihoods = mkv.computeLikelihoodReal((counts, emissions, 0, 0.5))
    null_likelihoods.name = 'null'
    return null_likelihoods
def findML(init, init_s, cd, E, h, eps, stepS=0.05):
    """Walk ``s`` along a grid away from zero while the likelihood keeps improving.

    Starting from the likelihoods ``init`` obtained at ``init_s``, each site is
    re-evaluated with ``mkv.computeLikelihoodReal`` at successive grid values
    of ``s`` (in the direction of ``init_s``, up to magnitude 1).  A site stays
    active only while its new likelihood exceeds the previous one by more than
    ``eps``; once it stops improving it keeps its last accepted ``s`` and
    likelihood.  The search ends when no site is active or the grid is
    exhausted.

    Args:
        init: Series of starting likelihoods, one per site.
        init_s: the ``s`` value at which ``init`` was computed; its sign
            selects the search direction.
        cd: DataFrame of site data with one row per entry of ``init``
            (rows are selected with a boolean mask in ``init`` order).
        E: emissions forwarded to ``mkv.computeLikelihoodReal``.
        h: value forwarded to ``mkv.computeLikelihoodReal``.
        eps: minimum likelihood gain required to accept the next grid value.
        stepS: grid spacing.

    Returns:
        DataFrame indexed like ``cd`` with columns ``alt`` (best likelihood
        found) and ``s`` (the grid value that produced it).
    """
    direction = np.sign(init_s)
    # Grid 0, +-stepS, +-2*stepS, ... up to +-1.  The first two points are
    # skipped: presumably 0 and +-stepS were already evaluated by the caller.
    S = np.arange(0, direction * 1.0001, direction * stepS)[2:]

    active = np.ones(len(init.index), dtype=bool)
    # Plain .copy(): a non-string ``order`` argument (the former ``copy(True)``)
    # is rejected by current NumPy, and order is irrelevant for 1-D data.
    mlprev = init.values.copy()
    mlcurrent = init.values.copy()
    ml = init.values.copy()
    mle = np.ones(mlcurrent.size) * init_s

    for s in S:
        # Checked before evaluating so that empty input (or a fully converged
        # set of sites) never reaches the likelihood routine.
        if not active.any():
            break
        mlprev[active] = mlcurrent[active]
        mlcurrent[active] = mkv.computeLikelihoodReal((cd[active], E, s, h))
        active = mlcurrent > mlprev + eps
        mle[active] = s
        ml[active] = mlcurrent[active]

    return pd.DataFrame([ml, mle], index=['alt', 's'], columns=cd.index).T
# Exploratory scratch analysis, written in interactive-session style.
# NOTE(review): s, scores, y, cdAll, S, res, dta, pplt and plt are used here
# but are not defined in the visible part of this file; confirm where they
# come from before running this section.
reload(utl)
pplt.GenomeChromosomewise(utl.scanGenomeSNP(utl.zpvalgenome2tail(s)))
scores.sort_values()  # result is discarded
pplt.GenomeChromosomewise(
    utl.scanGenomeSNP(scores.abs(), lambda x: x[x >= x.quantile(0.5)].sum())
)

# Values and their ranks side by side, ordered by the 's' values.
df = pd.concat([scores, s], axis=1)
df = pd.concat([df, df.rank()], axis=1, keys=['val', 'rank']).sort_values(('val', 's'))
dfy = pd.concat([df, y], axis=1).dropna()
dfy.sort_values(0)  # result is discarded

# Inspect the site that sorts last.
i = df.index[-1]
cdi = cdAll.loc[i]
print(cdi.unstack('REP'))
pplt.plotSiteReal(cdi)
cdiun = cdi.unstack('REP')
CD, E = dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T)
h = 0.5
reload(mkv)
mkv.computeLikelihoodReal((CD, E, 0, 0.5))  # result is discarded

# Likelihood of that single site for every value in S, indexed by that value.
likes = (
    pd.concat(map(lambda x: mkv.computeLikelihoodReal((CD, E, x, 0.5)), S), keys=S)
    .reset_index()
    .iloc[:, [0, -1]]
    .set_index('level_0')[0]
)
likes[0]  # result is discarded
reload(pplt)

# Left panel: likelihood curve; right panel: the site itself.
plt.figure(figsize=(6, 3), dpi=150)
plt.subplot(1, 2, 1)
pd.DataFrame(likes).plot(ax=plt.gca())
plt.subplot(1, 2, 2)
pplt.plotSiteReal(cdi, ax=plt.gca())
print(cdi.unstack('REP'))

res = res.reset_index().iloc[:, [0, 3]]
res = res.set_index(res.columns[0]).iloc[:, 0]
NN = np.arange(100, 1500, 100)


def likelihoodWithDifferentN(N=1000, s=0):
    """Return the likelihood of the module-level site ``cdi`` for a given ``N``.

    Args:
        N: forwarded as ``N`` to ``mkv.computePowerForSandSaveRealData`` and
            ``dta.precomputeCDandEmissionsFor``; also used as the name of the
            returned object.
        s: first entry of the ``(s, 0.5)`` pair passed to
            ``mkv.computePowerForSandSaveRealData``.

    Returns:
        The result of ``computeLikelihoodReal`` renamed to ``N``.
    """
    T = mkv.computePowerForSandSaveRealData((s, 0.5), N=N, save=False)
    CD, E = dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T, N=N)
    # NOTE(review): called unqualified and with a 3-tuple here, unlike the
    # mkv.computeLikelihoodReal((CD, E, s, h)) calls elsewhere; confirm this
    # name is imported and accepts this form.
    return computeLikelihoodReal((CD, E, T)).rename(N)


E  # result is discarded
z2 = pd.concat(map(lambda x: likelihoodWithDifferentN(s=x), S), keys=S)