コード例 #1
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def runHMM(h, stepS=0.05, eps=1e-1,CD=None,E=None,save=True,verbose=1):
    """Classify every row of ``CD`` as negative / null / positive and find its best ``s``.

    Each row's likelihood is evaluated at the null setting (via
    ``getNullLikelihoods``) and at one step ``-stepS`` / ``+stepS`` away.
    Rows are split into three groups by comparing those likelihoods, and the
    non-null groups are refined with ``findML``.

    Parameters
    ----------
    h : passed as the last element of the argument tuple given to
        ``mkv.computeLikelihoodReal`` and to ``findML``; also used as the
        outer column label of the result and in the output file name.
    stepS : step size used for the first probes at ``-stepS`` and ``+stepS``
        and forwarded to ``findML``.
    eps : improvement threshold forwarded to ``findML``.
    CD, E : inputs for ``mkv.computeLikelihoodReal``; when ``None`` they are
        read from ``real/CDEidx.df`` and ``real/Emissions.df`` under
        ``utl.outpath``.
    save : when true, the result is pickled to ``real/HMM/h<h>.df`` under
        ``utl.outpath``.
    verbose : when greater than 0, a one-line summary of group sizes is printed.

    Returns
    -------
    pandas.DataFrame whose columns are a two-level MultiIndex
    (``h``, ``stat``) built from the concatenated ``alt``/``s`` columns and
    the null likelihoods.
    """
    # Fall back to the pickled inputs when the caller supplies none.
    if CD is None:  CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:   E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = getNullLikelihoods(CD,E)
    # Likelihood one step in the negative direction, for every row.
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))

    # Only rows where the null beats the negative step are probed in the positive direction.
    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h));
    # Negative group: the negative step is at least as likely as the null.
    neg = likes_thn[likes_null <= likes_thn];
    # NOTE(review): the replace({False: None}).dropna() idiom appears intended to keep only
    # the index labels where the comparison is True -- confirm against the pandas version in use.
    # Null group: among the positively-probed rows, the null is at least as likely.
    zero = likes_null.loc[(likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index];
    # Positive group: the positive step is strictly more likely than the null.
    pos = likes_thp.loc[(likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index];
    if verbose>0:
        print 'N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(CD.shape[0], zero.size,
                                                               zero.size / float(CD.shape[0]) * 100,
                                                               pos.size, neg.size);
    # Flushed regardless of verbosity (this line is outside the ``if`` above).
    sys.stdout.flush()

    # Null rows keep their null likelihood as 'alt' and get s = 0.
    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt']);
    dfz['s'] = 0
    # Refine the non-null groups, starting from the first step in their direction.
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps,stepS)

    # Stack the three groups row-wise, then attach the null likelihoods as an extra column.
    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
コード例 #2
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def getNullLikelihoods(CD, E):
    """Return the likelihoods of ``CD`` at the null setting.

    Calls ``mkv.computeLikelihoodReal`` with the argument tuple
    ``(CD, E, 0, 0.5)`` and labels the result ``'null'`` so it can be
    concatenated as a named column by callers.
    """
    sys.stdout.flush()
    null_args = (CD, E, 0, 0.5)
    null_likes = mkv.computeLikelihoodReal(null_args)
    null_likes.name = 'null'
    return null_likes
コード例 #3
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def getNullLikelihoods37(sync_path='/home/arya/workspace/CLEAR/sample_data/popoolation2/F37.sync',
                         n_sites=100):
    """Return null likelihoods for the sites listed in a ``.sync`` file.

    The pickled ``real/CDEidx.df`` table is loaded, every column whose
    ``GEN`` level equals 59 is dropped, and the rows are restricted to the
    first ``n_sites`` (after sorting) index pairs found in ``sync_path``.

    Parameters
    ----------
    sync_path : path of a tab-separated file without header whose first two
        columns are used as the row index. Defaults to the location that was
        previously hard-coded, so existing zero-argument calls are unchanged.
    n_sites : number of sorted sites to keep (default 100, as before).

    Returns
    -------
    The result of ``mkv.computeLikelihoodReal((CD, E, 0, 0.5))`` with its
    ``name`` set to ``'null'``.
    """
    sys.stdout.flush()
    CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df')
    # Drop every column belonging to generation 59.
    CD = CD.loc[:, CD.columns.get_level_values('GEN') != 59]
    sites = (pd.read_csv(sync_path, sep='\t', header=None)
             .set_index([0, 1])
             .sort_index()
             .iloc[:n_sites])
    CD = CD.loc[sites.index].sort_index()
    E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = mkv.computeLikelihoodReal((CD, E, 0, 0.5))
    likes_null.name = 'null'
    return likes_null
コード例 #4
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def SNPscan(R,regAlpha,numProcess):
    """Evaluate ``mkv.computeLikelihoodReal`` for every entry of ``dta.getSH`` and reduce.

    Parameters
    ----------
    R : prepended (as a 1-tuple) to each entry returned by
        ``dta.getSH(sparse=True)`` to form the argument tuples.
    regAlpha : forwarded to ``mkv.maxLikelihood``.
    numProcess : 1 runs serially; any other value runs the evaluations in a
        ``Pool`` with that many workers.

    Returns
    -------
    The result of ``mkv.maxLikelihood`` applied to the column-wise
    concatenation of all likelihoods (column levels named ``s`` and ``h``).
    """
    reload(dta)
    SH = dta.getSH(sparse=True)
    ARGS = [(R,) + sh for sh in SH]
    print(pd.DataFrame(SH))
    print(R)
    if numProcess == 1:
        results = map(mkv.computeLikelihoodReal, ARGS)
    else:
        pool = Pool(numProcess)
        try:
            results = pool.map(mkv.computeLikelihoodReal, ARGS)
        finally:
            # Always release the workers, even when a task raises.
            pool.terminate()
    likelihoods = pd.concat(results, axis=1)
    likelihoods.columns.names = ['s', 'h']
    # Free the per-task inputs/outputs before the memory-heavy reduction.
    del ARGS, results
    gc.collect()
    likelihoods = mkv.maxLikelihood(likelihoods, regAlpha=regAlpha)
    gc.collect()
    return likelihoods
コード例 #5
0
ファイル: Utils.py プロジェクト: airanmehr/bio
def findML(init, init_s, cd, E, h, eps,stepS=0.05):
    """Step ``s`` away from ``init_s`` while the likelihood keeps improving.

    Parameters
    ----------
    init : Series of starting likelihoods (the values obtained at ``init_s``).
    init_s : starting value of ``s``; only its sign decides the direction of
        the grid. A zero ``init_s`` would give ``np.arange`` a zero step.
    cd : table of rows to evaluate; its index labels the returned frame.
    E, h : forwarded unchanged to ``mkv.computeLikelihoodReal``.
    eps : a step counts as an improvement only if the likelihood grows by
        more than ``eps``.
    stepS : spacing of the ``s`` grid.

    Returns
    -------
    pandas.DataFrame indexed by ``cd.index`` with columns ``alt`` (best
    likelihood found) and ``s`` (the grid value where it was found).
    """
    # Grid runs from 0 towards +/-1 in steps of stepS; the first two points
    # (0 and one step) are skipped.
    S = np.arange(0, np.sign(init_s) * 1.0001, np.sign(init_s) * stepS)[2:]
    # Boolean mask of rows that are still improving; all rows start active.
    i = pd.Series(True, index=init.index).values;
    mlprev = init.values.copy(True);
    mlcurrent = init.values.copy(True)
    # Best s per row, initialised to the starting point.
    mle = np.ones(mlcurrent.size) * init_s;
    ml = init.values.copy(True)
    for s in S:
        mlprev[i] = mlcurrent[i]
        # Only the still-active rows are re-evaluated at the new s.
        mlcurrent[i] = mkv.computeLikelihoodReal((cd[i], E, s, h))
        # Rows stay active only if this step improved them by more than eps.
        i = mlcurrent > mlprev + eps
        # print 's={:.2f}\th={}\tN={}'.format(s, h, i.sum());
        sys.stdout.flush()
        if i.sum() == 0: break
        mle[i] = s
        ml[i] = mlcurrent[i]
    return pd.DataFrame([ml, mle], index=['alt', 's'], columns=cd.index).T
コード例 #6
0
ファイル: stablity.py プロジェクト: airanmehr/bio
def createData(s):
    """Build and pickle 'stable' vs 'naive' log-transition matrices for steps 1, 10 and 100.

    The one-step matrix from ``mkv.Markov.computeTransition`` is
    exponentiated out of log space in extended precision, row-normalised,
    raised to the 10th and 100th power by repeated multiplication, and
    converted back to logs ('stable'). The 'naive' series holds the outputs
    of ``mkv.Markov.computeTransition`` / ``mkv.computePowerSimulations``.

    The resulting dict is pickled under ``utl.outpath``: to
    ``real/stablity.neutral.pkl`` when ``s == 0`` and to
    ``real/stablity.selection.pkl`` otherwise. Nothing is returned.
    """
    T = (mkv.Markov.computeTransition(s=s, N=1000, takeLog=True)
         .astype(np.float128)
         .apply(np.exp)
         .apply(lambda x: x / x.sum(), axis=1))
    # Exponentiation by squaring. The previous chain computed T4.dot(T2)
    # (= T^6) as "T8" and T10.dot(T10) (= T^20 at best) as "T100", so the
    # matrices stored under 10 and 100 were not the 10th and 100th powers.
    T2 = T.dot(T)
    T4 = T2.dot(T2)
    T8 = T4.dot(T4)
    T10 = T8.dot(T2)
    T20 = T10.dot(T10)
    T40 = T20.dot(T20)
    T80 = T40.dot(T40)
    T100 = T80.dot(T20)

    stable = pd.Series([T, T10, T100], index=[1, 10, 100]).apply(lambda x: x.applymap(np.log))
    # NOTE(review): the entry stored under 100 is computed with n=10, exactly
    # like the entry under 10 -- this looks like a copy-paste slip (n=100?);
    # confirm the meaning of ``n`` in mkv.computePowerSimulations before changing.
    naive = pd.Series(
        [mkv.Markov.computeTransition(s=s, N=1000, takeLog=True),
         mkv.computePowerSimulations(s=s, n=10, save=False),
         mkv.computePowerSimulations(s=s, n=10, save=False)],
        index=[1, 10, 100])
    data = {'naive': naive, 'stable': stable}
    if s == 0:
        pd.to_pickle(data, utl.outpath + 'real/stablity.neutral.pkl')
    else:
        pd.to_pickle(data, utl.outpath + 'real/stablity.selection.pkl')
コード例 #7
0
ファイル: createPool.py プロジェクト: airanmehr/bio
def computeEmissions(mypath=utl.simoutpath + "TimeSeries/simpop/"):
    """Compute state likelihoods for every distinct (c, d) pair seen in the simulations.

    For each depth in ``[30, 100, 300]`` every pickled simulation found
    directly inside ``mypath`` is loaded, the distinct pairs of its ``C`` and
    ``D`` values at that depth are collected, and
    ``mkv.getStateLikelihoods`` is evaluated for each pair on a frequency
    grid with spacing ``1 / (2 * sim.N)``. The per-depth results are stored
    as a Series indexed by depth and pickled to ``markov/Emissions.df`` under
    ``utl.outpath``. Nothing is returned.

    Parameters
    ----------
    mypath : directory containing the pickled simulations. A trailing path
        separator is no longer required.
    """
    print("computing emissions...")
    E = []
    depths = [30, 100, 300]
    # The directory listing does not depend on the depth, so take it once.
    sim_files = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
    for depth in depths:
        cd = []
        for f in sim_files:
            # Read through the same joined path that the isfile() filter used;
            # plain ``mypath + f`` breaks when mypath has no trailing separator.
            sim = pd.read_pickle(os.path.join(mypath, f))
            print(f)
            cd += [
                pd.concat(
                    [pd.Series(sim.C.loc[depth].reshape(-1)), pd.Series(sim.D.loc[depth].reshape(-1))], axis=1
                ).drop_duplicates()
            ]
        cd = pd.concat(cd).drop_duplicates()
        cd = cd.apply(lambda x: (x[0], x[1]), axis=1)
        cd.index = pd.MultiIndex.from_tuples(cd.values, names=["c", "d"])
        # ``sim`` is the last simulation loaded above; its N sets the grid resolution.
        nu = pd.Series(np.arange(0, 1.0000001, 1.0 / (2.0 * sim.N)), index=np.arange(0, 1.0000001, 1.0 / (2.0 * sim.N)))
        a = cd.apply(lambda x: mkv.getStateLikelihoods(x, nu)).sort_index()
        E += [a]
    pd.Series(E, index=depths).to_pickle(utl.outpath + "markov/Emissions.df")
コード例 #8
0
ファイル: workspace.py プロジェクト: airanmehr/bio
def likelihoodWithDifferentN(N=1000,s=0):
    """Return the likelihood of the module-level site ``cdi`` for population size ``N``.

    ``mkv.computePowerForSandSaveRealData`` is called with ``(s, 0.5)`` and
    ``N``; ``dta.precomputeCDandEmissionsFor`` is called on ``cdi`` (turned
    into a one-row frame) with the same ``N``. The result of
    ``computeLikelihoodReal`` is renamed to ``N`` before being returned.
    """
    transition = mkv.computePowerForSandSaveRealData((s, 0.5), N=N, save=False)
    site_frame = pd.DataFrame(cdi).T
    CD, E = dta.precomputeCDandEmissionsFor(site_frame, N=N)
    likelihood = computeLikelihoodReal((CD, E, transition))
    return likelihood.rename(N)
コード例 #9
0
ファイル: workspace.py プロジェクト: airanmehr/bio
# Interactive workspace script. It relies on names that are defined outside
# this excerpt (s, scores, y, cdAll, S, res) and on statement order.
reload(utl)
pplt.GenomeChromosomewise(utl.scanGenomeSNP(utl.zpvalgenome2tail(s)))
scores.sort_values()
# Scan using, per window, the sum of the absolute scores at or above the window median.
pplt.GenomeChromosomewise(utl.scanGenomeSNP(scores.abs(),lambda x: x[x>=x.quantile(0.5)].sum()))
# Put values and their ranks side by side, ordered by the ('val', 's') column.
df=pd.concat([scores,s],1);df=pd.concat([df,df.rank()],1,keys=['val','rank']).sort_values(('val','s'))
dfy=pd.concat([df,y],1).dropna()
dfy.sort_values(0)

# Pick the last row of the sorted table and inspect that single site.
i=df.index[-1];
cdi=cdAll.loc[i];print cdi.unstack('REP');pplt.plotSiteReal(cdi)
cdiun=cdi.unstack('REP')
CD,E=dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T)
h=0.5
reload(mkv)

# Likelihood at (0, 0.5), then one likelihood per value in S, indexed by that value.
mkv.computeLikelihoodReal((CD, E, 0, 0.5))
likes=pd.concat(map(lambda x:mkv.computeLikelihoodReal((CD, E, x, 0.5)),S),keys=S).reset_index().iloc[:,[0,-1]].set_index('level_0')[0]
likes[0]

reload(pplt)
# Left panel: the likelihood curve; right panel: the site itself.
plt.figure(figsize=(6,3),dpi=150);plt.subplot(1,2,1);pd.DataFrame(likes).plot(ax=plt.gca());plt.subplot(1,2,2);pplt.plotSiteReal(cdi,ax=plt.gca());print cdi.unstack('REP')

# Keep the first and fourth columns of res and turn them into a Series indexed by the first.
res=res.reset_index().iloc[:,[0,3]];res=res.set_index(res.columns[0]).iloc[:,0]

NN=np.arange(100,1500,100)
# Likelihood of the selected site ``cdi`` for a given N; the result is renamed to N.
def likelihoodWithDifferentN(N=1000,s=0):
    T=mkv.computePowerForSandSaveRealData((s,0.5),N=N,save=False)
    CD,E=dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T,N=N)
    return computeLikelihoodReal((CD,E,T)).rename(N)
E
# One likelihood per value in S at the default N, keyed by that value.
z2=pd.concat(map(lambda x: likelihoodWithDifferentN(s=x),S),keys=S)