Exemple #1
0
def runHMM(h, stepS=0.05, eps=1e-1,CD=None,E=None,save=True,verbose=1):
    """Estimate, per site, the best-scoring s for a fixed h.

    Every site is scored with mkv.computeLikelihoodReal at s=0 (the null,
    via getNullLikelihoods), at s=-stepS and, for the sites where the null
    beats s=-stepS, at s=+stepS. Sites are then split into three groups:

    * neg  - s=-stepS scores at least as well as the null,
    * pos  - s=+stepS scores strictly better than the null,
    * zero - the null scores at least as well as s=+stepS.

    neg and pos are refined with findML; zero sites keep s=0.

    Parameters
    ----------
    h : passed through as the fourth entry of the likelihood argument tuple.
    stepS : step used for the first probes (+/-stepS) and by findML.
    eps : minimum likelihood gain findML requires to keep moving.
    CD, E : inputs of the likelihood routine; when None they are read from
        utl.outpath + 'real/CDEidx.df' and 'real/Emissions.df'.
    save : when true, pickle the result under utl.outpath + 'real/HMM/'.
    verbose : when > 0, print the group sizes.

    Returns
    -------
    DataFrame with one row per site and columns 'alt', 's' and 'null' under
    a two-level column index (h, stat).
    """
    if CD is None:
        CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df')
    if E is None:
        E = pd.read_pickle(utl.outpath + 'real/Emissions.df')

    likes_null = getNullLikelihoods(CD, E)
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))
    # Only sites where the null beats s=-stepS are probed at s=+stepS.
    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h))

    neg = likes_thn[likes_null <= likes_thn]

    # Plain boolean masks; a comparison involving NaN is False, so such a
    # site lands in neither group (same as the original replace/dropna).
    null_at_thp = likes_null.loc[likes_thp.index]
    is_zero = null_at_thp >= likes_thp
    is_pos = null_at_thp < likes_thp
    zero = likes_null.loc[is_zero[is_zero].index]
    pos = likes_thp.loc[is_pos[is_pos].index]

    if verbose > 0:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        # '%' needs no escaping inside str.format.
        print('N={}\t Null={} ({:.0f}%)\t Pos={}\t Neg={}'.format(
            CD.shape[0], zero.size,
            zero.size / float(CD.shape[0]) * 100,
            pos.size, neg.size))
    sys.stdout.flush()

    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt'])
    dfz['s'] = 0
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps, stepS)

    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
Exemple #2
0
def getNullLikelihoods(CD,E):
    """Return the null likelihoods for CD and E as a result named 'null'.

    The null is mkv.computeLikelihoodReal evaluated with the third and
    fourth entries of its argument tuple fixed at 0 and 0.5.
    """
    sys.stdout.flush()
    null_args = (CD, E, 0, 0.5)
    null_likes = mkv.computeLikelihoodReal(null_args)
    # Set the name in place so the object returned by mkv is what we return.
    null_likes.name = 'null'
    return null_likes
Exemple #3
0
def getNullLikelihoods37(sync_path='/home/arya/workspace/CLEAR/sample_data/popoolation2/F37.sync',
                         n_sites=100, exclude_gen=59):
    """Null likelihoods for the first sites listed in a .sync file.

    Loads the pickled CD table, drops the columns whose 'GEN' level equals
    exclude_gen, keeps the first n_sites (after sorting) site keys found in
    the first two columns of sync_path, and scores them with
    mkv.computeLikelihoodReal at (0, 0.5).

    Parameters
    ----------
    sync_path : tab-separated, header-less file; columns 0 and 1 form the
        site index. Defaults to the path that was previously hard-coded.
    n_sites : number of sorted site keys to keep (None keeps all).
    exclude_gen : value of the 'GEN' column level to drop from CD.

    Returns
    -------
    The likelihood result of mkv.computeLikelihoodReal, named 'null'.
    """
    sys.stdout.flush()
    CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df')
    CD = CD.loc[:, CD.columns.get_level_values('GEN') != exclude_gen]
    # Only the two key columns are needed, so skip parsing the rest.
    sync_keys = pd.read_csv(sync_path, sep='\t', header=None, usecols=[0, 1])
    sites = sync_keys.set_index([0, 1]).sort_index().index[:n_sites]
    CD = CD.loc[sites].sort_index()
    E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = mkv.computeLikelihoodReal((CD, E, 0, 0.5))
    likes_null.name = 'null'
    return likes_null
Exemple #4
0
def findML(init, init_s, cd, E, h, eps,stepS=0.05):
    """Per-site line search over s, moving away from zero in steps of stepS.

    Starting from the likelihoods `init` (obtained at s=init_s), s is moved
    in the direction of sign(init_s) up to about |s|=1. A site stays active
    only while each new step improves its likelihood by more than eps; once
    a site fails to improve it is frozen at its last accepted value.

    Parameters
    ----------
    init : Series of starting likelihoods, one per site.
    init_s : the s at which `init` was computed; its sign sets the direction.
    cd : per-site data, row-aligned with `init`.
    E, h : forwarded to mkv.computeLikelihoodReal.
    eps : minimum likelihood gain required to accept a step.
    stepS : step size of the s grid.

    Returns
    -------
    DataFrame indexed by cd.index with columns 'alt' (best accepted
    likelihood) and 's' (the s at which it was obtained).
    """
    direction = np.sign(init_s)
    # [2:] drops 0 and the first step; presumably the caller has already
    # evaluated those (init is the likelihood at init_s) - confirm.
    S = np.arange(0, direction * 1.0001, direction * stepS)[2:]

    i = np.ones(init.size, dtype=bool)  # sites still improving
    # ndarray.copy takes an order string, not a bool; use the default.
    mlprev = init.values.copy()
    mlcurrent = init.values.copy()
    ml = init.values.copy()
    mle = np.ones(mlcurrent.size) * init_s

    for s in S:
        # Stop before calling the likelihood routine with no active sites
        # (also covers an empty `init`).
        if not i.any():
            break
        mlprev[i] = mlcurrent[i]
        mlcurrent[i] = mkv.computeLikelihoodReal((cd[i], E, s, h))
        i = mlcurrent > mlprev + eps
        sys.stdout.flush()
        mle[i] = s
        ml[i] = mlcurrent[i]
    return pd.DataFrame([ml, mle], index=['alt', 's'], columns=cd.index).T
Exemple #5
0
# --- Exploratory session: genome-wide views of the scores ---
reload(utl)
pplt.GenomeChromosomewise(utl.scanGenomeSNP(utl.zpvalgenome2tail(s)))
scores.sort_values()
pplt.GenomeChromosomewise(
    utl.scanGenomeSNP(scores.abs(), lambda x: x[x >= x.quantile(0.5)].sum()))

# Values and their ranks side by side, ordered by the ('val', 's') column.
df = pd.concat([scores, s], 1)
df = pd.concat([df, df.rank()], 1, keys=['val', 'rank'])
df = df.sort_values(('val', 's'))
dfy = pd.concat([df, y], 1).dropna()
dfy.sort_values(0)

# --- Inspect the site in the last row of df ---
i = df.index[-1]
cdi = cdAll.loc[i]
print(cdi.unstack('REP'))
pplt.plotSiteReal(cdi)
cdiun = cdi.unstack('REP')
CD, E = dta.precomputeCDandEmissionsFor(pd.DataFrame(cdi).T)
h = 0.5
reload(mkv)

# Likelihood of that site at s=0, then for every s in S (h fixed at 0.5).
mkv.computeLikelihoodReal((CD, E, 0, 0.5))
likes = pd.concat(map(lambda x: mkv.computeLikelihoodReal((CD, E, x, 0.5)), S), keys=S)
likes = likes.reset_index().iloc[:, [0, -1]]
likes = likes.set_index('level_0')[0]
likes[0]

# Left panel: likelihood curve; right panel: the site itself.
reload(pplt)
plt.figure(figsize=(6, 3), dpi=150)
plt.subplot(1, 2, 1)
pd.DataFrame(likes).plot(ax=plt.gca())
plt.subplot(1, 2, 2)
pplt.plotSiteReal(cdi, ax=plt.gca())
print(cdi.unstack('REP'))

# Reduce res to a Series: first column as index, fourth column as values.
res = res.reset_index().iloc[:, [0, 3]]
res = res.set_index(res.columns[0]).iloc[:, 0]

NN = np.arange(100, 1500, 100)
def likelihoodWithDifferentN(N=1000,s=0):
    """Likelihood of the module-level site `cdi` for a given N and s.

    Obtains T from mkv.computePowerForSandSaveRealData for (s, 0.5) and N
    (without saving), recomputes CD and E for `cdi` with the same N, and
    returns the resulting likelihood renamed to N.
    """
    T = mkv.computePowerForSandSaveRealData((s, 0.5), N=N, save=False)
    site_frame = pd.DataFrame(cdi).T
    cd_for_n, emissions_for_n = dta.precomputeCDandEmissionsFor(site_frame, N=N)
    # NOTE(review): computeLikelihoodReal is called unqualified and with a
    # 3-tuple here, unlike the mkv.computeLikelihoodReal((CD, E, s, h))
    # calls elsewhere - confirm this name is in scope and accepts (CD, E, T).
    likelihood = computeLikelihoodReal((cd_for_n, emissions_for_n, T))
    return likelihood.rename(N)
# Bare expression with no effect when run as a script; presumably left over
# from an interactive session where it displayed E.
E
# Evaluate likelihoodWithDifferentN (default N) for every s in S and stack
# the results, using the s values as the outer index keys.
z2=pd.concat(map(lambda x: likelihoodWithDifferentN(s=x),S),keys=S)