def runHMM(h, stepS=0.05, eps=1e-1, CD=None, E=None, save=True, verbose=1):
    """Classify every site as null / positive / negative and refine its ML estimate.

    CD and E are loaded from the pickles under ``utl.outpath + 'real/'`` when
    they are not supplied.  For each site the likelihood is evaluated at s=0
    (via getNullLikelihoods), at s=-stepS, and -- only for sites where the null
    beats the negative step -- at s=+stepS.  Sites are then split into:

    * neg:  likelihood at -stepS is at least the null likelihood,
    * zero: null likelihood is at least the likelihood at +stepS,
    * pos:  likelihood at +stepS is strictly greater than the null.

    neg and pos sites are refined with findML; zero sites get s=0.

    Returns a DataFrame whose columns are a MultiIndex (h, stat) built from
    the concatenated 'alt', 's' and 'null' columns.  When ``save`` is true the
    frame is also pickled under ``utl.outpath + 'real/HMM/'``.
    """
    if CD is None:
        CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:
        E = pd.read_pickle(utl.outpath + 'real/Emissions.df')

    likes_null = getNullLikelihoods(CD, E)
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))
    # The positive step is only evaluated where the negative step lost to the null.
    null_beats_neg = likes_null > likes_thn
    likes_thp = mkv.computeLikelihoodReal((CD[null_beats_neg], E, stepS, h))

    neg = likes_thn[likes_null <= likes_thn]

    # replace({False: None}).dropna() keeps only the labels whose comparison is True.
    zero_labels = (likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index
    zero = likes_null.loc[zero_labels]
    pos_labels = (likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index
    pos = likes_thp.loc[pos_labels]

    if verbose > 0:
        n_sites = CD.shape[0]
        print('N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(
            n_sites, zero.size, zero.size / float(n_sites) * 100, pos.size, neg.size))
        sys.stdout.flush()

    dfz = pd.DataFrame(data=zero.values, index=zero.index, columns=['alt'])
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps, stepS)

    stacked = pd.concat([dfp, dfz, dfn])
    df = pd.concat([stacked, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])

    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
def getNullLikelihoods(CD, E):
    """Return the likelihoods of CD under s=0, h=0.5, with the result named 'null'.

    The work is delegated to mkv.computeLikelihoodReal, which receives the
    packed tuple (CD, E, 0, 0.5).
    """
    null_args = (CD, E, 0, 0.5)
    null_likelihoods = mkv.computeLikelihoodReal(null_args)
    null_likelihoods.name = 'null'
    return null_likelihoods
def getNullLikelihoods37():
    """Return null (s=0, h=0.5) likelihoods for a 100-site subset of the real data.

    The CD table is read from disk, columns whose 'GEN' level equals 59 are
    dropped, and rows are restricted to the first 100 positions (after sorting
    by the first two columns) of the F37 sync file.  The result is named 'null'.
    """
    counts = pd.read_pickle(utl.outpath + 'real/CDEidx.df')
    keep_generation = counts.columns.get_level_values('GEN') != 59
    counts = counts.loc[:, keep_generation]

    # NOTE(review): absolute, machine-specific path -- confirm it is still valid.
    sync = pd.read_csv('/home/arya/workspace/CLEAR/sample_data/popoolation2/F37.sync',
                       sep='\t', header=None)
    sync = sync.set_index([0, 1]).sort_index().iloc[:100]
    counts = counts.loc[sync.index].sort_index()

    emissions = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    null_likelihoods = mkv.computeLikelihoodReal((counts, emissions, 0, 0.5))
    null_likelihoods.name = 'null'
    return null_likelihoods
def SNPscan(R, regAlpha, numProcess):
    """Evaluate the likelihood of R on the (s, h) grid and pick the maximum.

    The grid comes from ``dta.getSH(sparse=True)``; R is prepended to every
    grid tuple and each resulting tuple is passed to
    ``mkv.computeLikelihoodReal``.  The per-grid-point results are concatenated
    column-wise (column levels named 's' and 'h') and reduced with
    ``mkv.maxLikelihood(..., regAlpha=regAlpha)``, whose result is returned.

    numProcess == 1 runs serially; any other value uses a multiprocessing Pool
    of that size.
    """
    reload(dta)
    SH = dta.getSH(sparse=True)
    ARGS = [(R,) + sh for sh in SH]
    print(pd.DataFrame(SH))
    print(R)

    if numProcess == 1:
        results = [mkv.computeLikelihoodReal(args) for args in ARGS]
    else:
        pool = Pool(numProcess)
        try:
            results = pool.map(mkv.computeLikelihoodReal, ARGS)
        finally:
            # Always reap the workers, even when a worker raised.
            pool.terminate()

    likelihoods = pd.concat(results, axis=1)
    likelihoods.columns.names = ['s', 'h']

    # Drop the large intermediates before the reduction step.
    del ARGS, results
    gc.collect()
    likelihoods = mkv.maxLikelihood(likelihoods, regAlpha=regAlpha)
    gc.collect()
    return likelihoods
def findML(init, init_s, cd, E, h, eps, stepS=0.05):
    """Walk s away from init_s in steps of stepS while the likelihood keeps improving.

    Parameters
    ----------
    init : pandas.Series
        Likelihood of every site at ``s = init_s``.
    init_s : float
        Starting selection strength; its sign fixes the search direction.
    cd : pandas.DataFrame
        Site data, row-aligned with ``init``; rows are selected by boolean mask.
    E, h :
        Forwarded unchanged to ``mkv.computeLikelihoodReal``.
    eps : float
        A step is accepted for a site only when it raises that site's
        likelihood by more than ``eps``.
    stepS : float
        Grid spacing; the grid covers ``sign(init_s) * [2*stepS, ..., ~1]``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``cd`` with columns 'alt' (best likelihood found) and
        's' (the s at which it was found).
    """
    direction = np.sign(init_s)
    # Grid points 0 and 1*stepS are skipped: init already holds the value at init_s.
    S = np.arange(0, direction * 1.0001, direction * stepS)[2:]

    # Plain .copy(): the original passed True as `order`, which current NumPy
    # rejects; for these 1-D arrays the memory order makes no difference.
    start = init.values
    mlprev = start.copy()
    mlcurrent = start.copy()
    ml = start.copy()
    mle = np.ones(mlcurrent.size) * init_s

    # Sites still improving; every site starts active.
    active = np.ones(len(init), dtype=bool)
    for s in S:
        # Stop as soon as no site is active (also covers an empty `init`),
        # so the likelihood routine is never called on an empty selection.
        if not active.any():
            break
        mlprev[active] = mlcurrent[active]
        mlcurrent[active] = mkv.computeLikelihoodReal((cd[active], E, s, h))
        # Inactive sites keep frozen prev/current values, so they stay inactive.
        active = mlcurrent > mlprev + eps
        mle[active] = s
        ml[active] = mlcurrent[active]

    return pd.DataFrame([ml, mle], index=['alt', 's'], columns=cd.index).T
def _transitionPower(T, n):
    """Return the n-th matrix power (n >= 1) of square DataFrame T by repeated squaring."""
    result = None
    base = T
    while n:
        if n & 1:
            result = base if result is None else result.dot(base)
        n >>= 1
        if n:
            base = base.dot(base)
    return result


def createData(s):
    """Build and pickle 'naive' vs 'stable' log transition matrices for powers 1, 10, 100.

    'stable' exponentiates the log transition matrix in extended precision,
    row-normalises it, raises it to the 1st, 10th and 100th matrix power and
    takes logs again.  'naive' holds the results returned by ``mkv`` for the
    same powers.  Both are stored in a dict and pickled to
    ``utl.outpath + 'real/stablity.neutral.pkl'`` when s == 0, otherwise to
    ``utl.outpath + 'real/stablity.selection.pkl'``.
    """
    logT = mkv.Markov.computeTransition(s=s, N=1000, takeLog=True)
    # np.longdouble is the portable spelling of np.float128 (same type where
    # float128 exists).
    T = logT.astype(np.longdouble).apply(np.exp).apply(lambda x: x / x.sum(), axis=1)

    # The original chain computed T^8 and T^16 under the names T10 and T100;
    # compute the powers the index labels actually promise.
    T10 = _transitionPower(T, 10)
    T100 = _transitionPower(T10, 10)
    stable = pd.Series([T, T10, T100], index=[1, 10, 100]).apply(lambda x: x.applymap(np.log))

    # NOTE(review): the entry labelled 100 used n=10 in the original (a copy of
    # the previous entry); n=100 assumes `n` is the matrix power -- confirm
    # against mkv.computePowerSimulations.
    naive = pd.Series(
        [
            mkv.Markov.computeTransition(s=s, N=1000, takeLog=True),
            mkv.computePowerSimulations(s=s, n=10, save=False),
            mkv.computePowerSimulations(s=s, n=100, save=False),
        ],
        index=[1, 10, 100],
    )

    data = {'naive': naive, 'stable': stable}
    if s == 0:
        pd.to_pickle(data, utl.outpath + 'real/stablity.neutral.pkl')
    else:
        pd.to_pickle(data, utl.outpath + 'real/stablity.selection.pkl')
def computeEmissions(mypath=utl.simoutpath + "TimeSeries/simpop/"):
    """Compute state likelihoods for every distinct (c, d) pair seen in the simulations.

    For each depth in [30, 100, 300], every regular file in ``mypath`` is
    unpickled, the flattened ``C`` and ``D`` values at that depth are paired
    up, and duplicates are removed.  Each distinct pair is passed to
    ``mkv.getStateLikelihoods`` together with a frequency grid from 0 to 1 in
    steps of 1/(2N).  The per-depth results are stored in a Series indexed by
    depth and pickled to ``utl.outpath + "markov/Emissions.df"``.

    Raises
    ------
    ValueError
        If ``mypath`` contains no regular files.
    """
    print("computing emissions...")
    # List the directory once; the same files are used for every depth.
    files = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
    if not files:
        raise ValueError("no simulation files found in {}".format(mypath))

    E = []
    depths = [30, 100, 300]
    for depth in depths:
        cd = []
        for f in files:
            # Join the same way as the isfile() filter above, so a path without
            # a trailing separator works too.
            sim = pd.read_pickle(os.path.join(mypath, f))
            print(f)
            cd += [
                pd.concat(
                    [pd.Series(sim.C.loc[depth].reshape(-1)), pd.Series(sim.D.loc[depth].reshape(-1))], axis=1
                ).drop_duplicates()
            ]
        cd = pd.concat(cd).drop_duplicates()
        cd = cd.apply(lambda x: (x[0], x[1]), axis=1)
        cd.index = pd.MultiIndex.from_tuples(cd.values, names=["c", "d"])
        # The grid uses N from the last simulation read.
        # NOTE(review): presumably all simulations share the same N -- confirm.
        grid = np.arange(0, 1.0000001, 1.0 / (2.0 * sim.N))
        nu = pd.Series(grid, index=grid)
        a = cd.apply(lambda x: mkv.getStateLikelihoods(x, nu)).sort_index()
        E += [a]
    pd.Series(E, index=depths).to_pickle(utl.outpath + "markov/Emissions.df")
def likelihoodWithDifferentN(N=1000, s=0):
    """Return the likelihood of the module-level site ``cdi`` for size N, renamed to N.

    The transition data comes from ``mkv.computePowerForSandSaveRealData`` for
    (s, 0.5) without saving, and the CD/emission inputs come from
    ``dta.precomputeCDandEmissionsFor`` for the same N.
    """
    # NOTE(review): `cdi` is read from enclosing/global scope, and
    # `computeLikelihoodReal` is called unqualified here although the rest of
    # this file uses `mkv.computeLikelihoodReal` -- confirm both resolve.
    transition = mkv.computePowerForSandSaveRealData((s, 0.5), N=N, save=False)
    site_frame = pd.DataFrame(cdi).T
    CD, E = dta.precomputeCDandEmissionsFor(site_frame, N=N)
    likelihood = computeLikelihoodReal((CD, E, transition))
    return likelihood.rename(N)
# Exploratory scratch statements, kept in their original order.
# NOTE(review): `s`, `scores`, `y`, `cdAll`, `S` and `res` are not defined in
# this section; presumably they come from an interactive session -- confirm
# before running this as a script.
reload(utl)
# Genome-wide plots of the scanned statistics.
pplt.GenomeChromosomewise(utl.scanGenomeSNP(utl.zpvalgenome2tail(s)))
scores.sort_values()
# Window statistic: sum of the values at or above the window median.
pplt.GenomeChromosomewise(utl.scanGenomeSNP(scores.abs(),lambda x: x[x>=x.quantile(0.5)].sum()))
# Put scores and s side by side with their ranks, ordered by ('val', 's').
df=pd.concat([scores,s],1);df=pd.concat([df,df.rank()],1,keys=['val','rank']).sort_values(('val','s'))
dfy=pd.concat([df,y],1).dropna()
dfy.sort_values(0)
# Take the last row of the sorted frame and inspect that site.
i=df.index[-1]; cdi=cdAll.loc[i];print cdi.unstack('REP');pplt.plotSiteReal(cdi)
cdiun=cdi.unstack('REP')
CD,E=dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T)
h=0.5
reload(mkv)
mkv.computeLikelihoodReal((CD, E, 0, 0.5))
# Likelihood of the selected site for every value in S, at h=0.5.
likes=pd.concat(map(lambda x:mkv.computeLikelihoodReal((CD, E, x, 0.5)),S),keys=S).reset_index().iloc[:,[0,-1]].set_index('level_0')[0]
likes[0]
reload(pplt)
# Left panel: likelihood curve; right panel: the site itself.
plt.figure(figsize=(6,3),dpi=150);plt.subplot(1,2,1);pd.DataFrame(likes).plot(ax=plt.gca());plt.subplot(1,2,2);pplt.plotSiteReal(cdi,ax=plt.gca());print cdi.unstack('REP')
res=res.reset_index().iloc[:,[0,3]];res=res.set_index(res.columns[0]).iloc[:,0]
NN=np.arange(100,1500,100)
# Likelihood of the selected site `cdi` for a given N and s; the result is renamed to N.
# NOTE(review): `computeLikelihoodReal` is unqualified here, unlike the
# `mkv.computeLikelihoodReal` calls above -- confirm it is in scope.
def likelihoodWithDifferentN(N=1000,s=0):
    T=mkv.computePowerForSandSaveRealData((s,0.5),N=N,save=False)
    CD,E=dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T,N=N)
    return computeLikelihoodReal((CD,E,T)).rename(N)
# Bare expression: has no effect outside an interactive session.
E
z2=pd.concat(map(lambda x: likelihoodWithDifferentN(s=x),S),keys=S)