Ejemplo n.º 1
0
Archivo: LD.py Proyecto: airanmehr/bio
def plotScalingFactor():
    """Plot the frequency trajectory and the log scaling factors for four scenarios.

    For every (nu0, s) pair in [0.005, 0.1] x [0.025, 0.1] a two-panel figure
    is drawn. The top panel shows nu(t); the bottom panel shows
    log(H(t) / H(0)) labelled 'log(Growth)', -2 * r * t * l labelled
    'log(Decay)', and their sum, with a dashed vertical line at the index
    where the absolute sum is smallest (the first row is excluded).
    Each figure is written through pplt.savefig as 'decayFactors<j>', where j
    is the running index of the scenario.
    """
    rate = 2 * 1e-8
    length = 5e4
    dpi = 300
    scenarios = [(nu0, s) for nu0 in [0.005, 0.1] for s in [0.025, 0.1]]
    for j, (nu0, s) in enumerate(scenarios):
        horizon = 2 * (utl.logit(0.995) - utl.logit(nu0)) / s + 1.
        t = np.arange(0, horizon, 1)
        fig, (top, bottom) = plt.subplots(2, 1, figsize=(5.5, 2.5), dpi=dpi, sharex=True)

        # Top panel: trajectory, annotated with the scenario parameters.
        nu(t, s=s, nu0=nu0).plot(color='k', legend=False, ax=top)
        sweepKind = 'Hard' if nu0 == 0.005 else 'Soft'
        pplt.annotate(r'$s$={}, $\nu_0=${} ({} Sweep)'.format(s, nu0, sweepKind), fontsize=7, ax=top)
        pplt.setSize(ax=top, fontsize=6)
        top.set_ylabel(r'$\nu_t$')

        # Bottom panel: the two log factors and their sum.
        H0 = H(t[0], s=s, nu0=nu0)
        Ht = H(t, s=s, nu0=nu0)
        factors = [np.log(Ht / H0), -2 * rate * t * length]
        df = pd.DataFrame(factors, columns=t, index=['log(Growth)', r'log(Decay)']).T
        df['log(Growth) + log(Decay)'] = df.sum(1)
        df.plot(ax=bottom, grid=True, linewidth=2)
        bottom.set_xlabel('Generations')
        bottom.set_ylabel('Log(Scaling Factor)')
        balancePoint = df.iloc[1:, 2].abs().idxmin()
        bottom.axvline(balancePoint, color='k', linestyle='--', linewidth=0.5)
        bottom.legend(['log(Growth)', r'log(Decay)', 'log(Growth) + log(Decay)'],
                      bbox_to_anchor=(1.45, .75), prop={'size': 6})
        pplt.setSize(bottom, fontsize=6)

        # Leave room on the right for the legend placed outside the axes.
        plt.tight_layout(pad=0.1, rect=[0, 0, 0.7, 1])
        plt.gcf().subplots_adjust(bottom=0.15)
        pplt.savefig('decayFactors{}'.format(j), dpi=dpi)
Ejemplo n.º 2
0
def runHMM(h, stepS=0.05, eps=1e-1,CD=None,E=None,save=True,verbose=1):
    """Split the rows of CD into null / positive / negative groups and assemble the results for one h.

    Args:
        h: value forwarded to mkv.computeLikelihoodReal and findML; also used as
            the top column level of the result and in the output file name.
        stepS: step used for the two initial likelihood evaluations (at -stepS
            and +stepS) and forwarded to findML.
        eps: forwarded to findML.
        CD: input frame; read from utl.outpath + 'real/CDEidx.df' when None.
        E: emissions; read from utl.outpath + 'real/Emissions.df' when None.
        save: when true, the result is pickled under utl.outpath + 'real/HMM/'.
        verbose: when greater than 0, a one-line summary of group sizes is printed.

    Returns:
        DataFrame with a (h, stat) column MultiIndex: the positive, null and
        negative results stacked row-wise, joined column-wise with the null
        likelihoods.
    """
    if CD is None:  CD = pd.read_pickle(utl.outpath + 'real/CDEidx.df').iloc[:]
    if E is None:   E = pd.read_pickle(utl.outpath + 'real/Emissions.df')
    likes_null = getNullLikelihoods(CD,E)
    # Likelihood one step to the negative side, evaluated for every row.
    likes_thn = mkv.computeLikelihoodReal((CD, E, -stepS, h))

    # The positive side is only evaluated for rows where the null beats the negative step.
    likes_thp = mkv.computeLikelihoodReal((CD[likes_null > likes_thn], E, stepS, h));
    # Rows where the negative step is at least as likely as the null.
    neg = likes_thn[likes_null <= likes_thn];
    # replace({False: None}).dropna() keeps only the index entries whose comparison is True.
    zero = likes_null.loc[(likes_null.loc[likes_thp.index] >= likes_thp).replace({False: None}).dropna().index];
    pos = likes_thp.loc[(likes_null.loc[likes_thp.index] < likes_thp).replace({False: None}).dropna().index];
    if verbose>0:
        print 'N={}\t Null={} ({:.0f}\%)\t Pos={}\t Neg={}'.format(CD.shape[0], zero.size,
                                                               zero.size / float(CD.shape[0]) * 100,
                                                               pos.size, neg.size);
    sys.stdout.flush()

    # Null rows keep their null likelihood as 'alt' and get s = 0.
    dfz = pd.DataFrame(zero.values, index=zero.index, columns=['alt']);
    dfz['s'] = 0
    # findML is called separately for the negative and the positive group.
    dfn = findML(neg, -stepS, CD.loc[neg.index], E, h, eps, stepS)
    dfp = findML(pos, stepS, CD.loc[pos.index], E, h, eps,stepS)

    df = pd.concat([dfp, dfz, dfn])
    df = pd.concat([df, likes_null], axis=1)
    df.columns = pd.MultiIndex.from_product([[h], df.columns], names=['h', 'stat'])
    if save:
        path = utl.outpath + 'real/HMM/'
        utl.mkdir(path)
        df.to_pickle(path + 'h{:E}.df'.format(h))
    return df
Ejemplo n.º 3
0
Archivo: Run.py Proyecto: airanmehr/bio
def Power(method, depthRate, nu0, s, numReplicates=3, samplingWindow=50, L=50000, numExperiments=500, numProcess=4):
    param = {'numExperiments': numExperiments, 'method': method, 'numThreads': numProcess, 'ModelName': 'TimeSeries',
             'samplingWindow': samplingWindow, 'L': L, 'numReplicates': numReplicates, 'depthRate': depthRate}
    print  '\nMethod={}\tR={}\twin={}\tnu0={}\ts={}, depthRate={}'.format(method, numReplicates, samplingWindow, nu0, s,
                                                                          depthRate)
    sys.stdout.flush()
    if method in ['CMH', 'HMM'] and depthRate == np.inf: return
    if not s and nu0 == 0.1: return
    param['nu0'] = nu0
    param['s'] = s
    params = getParamsForExperiments(param)
    if numProcess == 1:
        a = map(runOne, params)
    else:
        pool = Pool(numProcess)
        a = pool.map(runOne, params)
        pool.terminate()
    gc.collect()
    df = pd.concat(a)
    sys.stdout.flush()
    df.sortlevel(inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    print df
    outpath = utl.outpath + 'ROC/runs/'
    utl.mkdir(outpath)
    df.to_pickle('{}{}.{:.0f}.{:E}.{:E}.df'.format(outpath, method, depthRate, nu0, s))
Ejemplo n.º 4
0
def computePowerForSandSaveRealData(sh, NumericallyStable=False, TakeLog=False, N = 1000,save=True):
    def computeTs(T):
        T2=T.dot(T).astype(float)
        T3=T2.dot(T)
        T4=T2.dot(T2)
        T5=T3.dot(T2)
        T10=T5.dot(T5)
        T12=T10.dot(T2)
        T14=T4.dot(T10)
        T15=T5.dot(T10)
        T22=T12.dot(T10)
        T23=T22.dot(T)
        if TakeLog:
            return pd.Series(map(utl.numbaLog, [T10, T12, T14, T15, T22, T23]), index=[10, 12, 14, 15, 22, 23])
        else:
            return pd.Series([T10, T12, T14, T15, T22, T23], index=[10, 12, 14, 15, 22, 23])
    s,h=sh
    path='{}transition/real/'.format(utl.outpath)
    utl.mkdir(path)
    fname = '{}S{:E}.H{:E}.df'.format(path, np.round(s, 2), h)
      # number of diploids
    # T = Markov.computeTransition(s, N, h=h, takeLog=True) #OLD NUMERICALLY STABLE
    # T=T.apply(lambda x: x-x.max(),axis=1).astype(np.float128).apply(np.exp).apply(lambda x: x/x.sum(),axis=1)
    # Tn=computeTs(T)
    T = Markov.computeTransition(s, N, h=h, takeLog=False)
    Tn=computeTs(T)
    zero = (0, -np.inf)[TakeLog]
    print 'Computed power for s={}, h={}'.format(s, h) + '  Number of zero prob transitions:', (
                                                                                               Tn.iloc[-1] == zero).sum(
        1).iloc[1:-1].sum()
    if save:
        Tn.to_pickle(fname)
    else:
        return Tn
    gc.collect()
Ejemplo n.º 5
0
def computeStatistics():
    """Compute genome-scan statistics for single populations and for pairwise-averaged populations.

    Reads 'tot.snp.ref.freqs' from the module-level `path`, labels its nine
    data columns with (POP, REP) pairs, and writes three pickles under `path`:
    'pairwise.population.df' (the pairwise column averages), 'single.df' and
    'pairwise.df' (the unrolled scan statistics). Returns nothing.

    NOTE(review): `path` and `est` are not defined in this block; presumably
    module-level names — confirm.
    """
    # Column labels: each token such as 'C1' becomes the pair ('C', 1).
    cols = pd.MultiIndex.from_tuples(
        map(lambda x: (x[0], int(x[1])), ' C1      C2      C3      H1      H2      H3      L1      L2      L3'.split()),
        names=['POP', 'REP'])
    a = pd.read_csv(path + 'tot.snp.ref.freqs', sep='\t', header=None, index_col=range(4),
                    names=['CHROM', 'POS', 'REF', 'ALT'] + range(9))
    a.columns = cols
    # Average every unordered pair of columns; the new column name is the
    # concatenation of both labels (e.g. 'C1' + 'C2').
    pairwise = pd.concat([((a[a.columns[i]] + a[a.columns[j]]) / 2).rename(
        ''.join(map(str, a.columns[i])) + ''.join(map(str, a.columns[j]))) for i in range(a.shape[1]) for j in
                          range(i + 1, a.shape[1])], axis=1)
    pairwise.to_pickle(path + 'pairwise.population.df')

    reload(est)

    def unroll(all):
        # Each cell holds a mapping; build one sub-frame per key of the first
        # cell and put the key on a new top column level named 'STAT'.
        all = pd.concat([all.applymap(lambda x: x[k]) for k in all.iloc[0, 0].keys()], keys=all.iloc[0, 0].keys(),
                        axis=1)
        all.columns.names = ['STAT'] + list(all.columns.names[1:])
        return all

    # Scan each column group separately; n=200 for single populations.
    single = unroll(a.groupby(level=[0], axis=1).apply(
        lambda x: utl.scanGenome(x, f=lambda x: est.Estimate.getEstimate(x, n=200, method='all'))[x.name]))
    single.to_pickle(path + 'single.df')
    # Same scan on the pairwise averages with n=400.
    pairwise = unroll(pairwise.groupby(level=[0], axis=1).apply(
        lambda x: utl.scanGenome(x, f=lambda x: est.Estimate.getEstimate(x, n=400, method='all'))[x.name]))
    pairwise.to_pickle(path + 'pairwise.df')
Ejemplo n.º 6
0
def plotQuantile(df, kde):
    """Draw a QQ plot of df.COMALE ('data') against df.COMALENC ('null') and save it as 'qq.pdf'.

    Args:
        df: frame exposing COMALE and COMALENC columns.
        kde: density object forwarded to utl.getQantilePvalues.
    """
    import Util as utl
    # 999 evenly spaced quantiles below 1, plus 10 extra points in [0.999, 1].
    coarse = np.linspace(0.0, 1, 1000)[:-1]
    tail = np.linspace(0.999, 1, 10)
    quantiles = np.sort(np.append(coarse, tail))
    observed = utl.getQantilePvalues(df.COMALE, kde, quantiles=quantiles)
    control = utl.getQantilePvalues(df.COMALENC, kde, quantiles=quantiles)
    qq = pd.concat([observed, control], axis=1)
    qq.columns = ['data', 'null']
    QQPval(qq, fname=utl.paperFiguresPath + 'qq.pdf')
Ejemplo n.º 7
0
Archivo: QQ.py Proyecto: airanmehr/bio
def real():
    """Draw the QQ plot of the real-data scores against the negative-control scores.

    Scores are s * (alt - null) for both frames. The density is estimated from
    the negative-control scores, and the plot is saved as 'qq.pdf' under
    utl.paperFiguresPath. The pplt module is reloaded at the end.
    """
    def score(frame):
        return frame.s * (frame.alt - frame.null)

    G = score(pd.read_pickle(
        utl.outpath + 'real/real.replicates.uptoF59.maxLikelihoods.regularized.LowCovRemoved.df'))
    # Loaded as in the original flow; the value is not used afterwards.
    R = pd.read_pickle(utl.outpath + 'real/real.replicates.uptoF59.df')
    F = score(pd.read_pickle(utl.outpath + 'real/negativeControl.Simulations.maxLikelihoods.regularized.df'))
    kde = utl.getDensity(F, width=100)
    # 99 evenly spaced quantiles below 1, plus 1000 extra points in [0.999, 1].
    coarse = np.linspace(0.0, 1, 100)[:-1]
    tail = np.linspace(0.999, 1, 1000)
    q = np.sort(np.append(coarse, tail))
    observed = utl.getQantilePvalues(G, kde, quantiles=q)
    control = utl.getQantilePvalues(F, kde, quantiles=q)
    qq = pd.concat([observed, control], axis=1)
    qq.columns = ['data', 'null']
    pplt.QQPval(qq, fname=utl.paperFiguresPath + 'qq.pdf')
    reload(pplt)
Ejemplo n.º 8
0
def runOne(args):
    path = utl.outpath + 'markov/simulations/'
    utl.mkdir(path)
    numExp = int(1e5)
    nu0, s = args
    print nu0, s
    for i, batch in enumerate(utl.batch(range(numExp), 10000)):
        print;
        print i, batch[0], batch[-1]
        a = pd.concat(map(lambda x: Simulation.simulateSingleLoci(nu0=nu0, s=s)[[1, 10, 100]], batch), axis=1).T
        a.to_pickle(path + 'nu{:E}.s{:E}.{}.df'.format(nu0, s, i))
Ejemplo n.º 9
0
def plotSNPPval(out):
    scores = rutl.loadScores()
    kde = utl.getDensity(scores, width=1);
    pval = utl.getPvalKDE(out.sort_values(ascending=False).iloc[:1200], kde)
    print pval.sort_values()
    pval[pval >= 3].size
    df = pd.DataFrame(pval)
    df = pd.concat([df[df.index.get_level_values('CHROM') == ch] for ch in
                    ['X', '2L', '2R', '3L', '3R', '4', '2LHet', '2RHet', '3LHet', '3RHet', 'XHet']])
    fig = plt.figure(figsize=(7, 2), dpi=300);
    pplt.Manhattan(df, fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 8) for ax in fig.get_axes()]
Ejemplo n.º 10
0
def outlier():
    """Find global, per-chromosome and local outlier windows; pickle and plot each set.

    The windowed statistic is the mean absolute score per window, taken from
    rutl.loadScores() after rutl.removeHeteroChromatin. For each of the three
    outlier definitions the outliers are pickled under utl.outpath + 'real/'
    and a Manhattan plot is saved under utl.paperPath + 'new/'.
    """
    def report(series, flagged, pickleName, figName):
        # Persist the outliers, then draw and save the Manhattan plot.
        flagged.to_pickle(utl.outpath + pickleName)
        fig = plt.figure(figsize=(7, 1.5), dpi=300)
        pplt.Manhattan(data=series, Outliers=pd.DataFrame(flagged), fig=fig, markerSize=2, ticksize=8,
                       sortedAlready=True)
        for ax in fig.get_axes():
            pplt.setSize(ax, 5)
        plt.gcf().subplots_adjust(bottom=0.15)
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format(figName))

    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale
    windowStats = {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), windowStats))[[field, 'Num. of SNPs']]
    a = df.iloc[:, 0]

    # Global: above the genome-wide 99th percentile.
    a = a.rename('Global Outliers')
    report(a, a[a > a.quantile(0.99)], 'real/outliers.global.df', 'global')

    # Per chromosome: above the 99th percentile of each first-level group.
    a = a.rename('Chrom Outliers')
    perChrom = a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    report(a, perChrom, 'real/outliers.chrom.df', 'chrom')

    # Local: delegated to localOutliers.
    a = a.rename('Local Outliers')
    report(a, localOutliers(a), 'real/outliers.local.df', 'local')
Ejemplo n.º 11
0
def scanSFS():
    """Scan SFSelect at generations 0 and 59 and plot their difference next to the score scan.

    The windowed mean absolute score is plotted through plotOne with its
    top-1% windows. SFSelect (n=100) is then scanned on rutl.getNut(0) and
    rutl.getNut(59) through scanOne. A Manhattan plot of the difference, both
    scans and the score scan is saved as 'sfs-clear.pdf', with the top 1% of
    the difference marked as outliers and negative differences blanked out.
    """
    scores = rutl.loadScores()
    field = comale
    windowStats = {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), windowStats))[[field, 'Num. of SNPs']]
    plotOne(df, df[df[field] > df[field].quantile(0.99)], fname='all')
    nu0 = rutl.getNut(0)
    nut = rutl.getNut(59)
    reload(rutl)
    n = 100
    SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=n)

    sf0 = scanOne(nu0, SFSelect, 'SFSelect.Base', 'SFSelect.Base')
    sft = scanOne(nut, SFSelect, 'SFSelect.Final', 'SFSelect.Final')

    base = sf0.iloc[:, 0]
    final = sft.iloc[:, 0]
    delta = (final - base).rename('SFS(59)-SFS(0)')
    sfr = pd.concat([delta, base, final, df.iloc[:, 0]], axis=1)
    # Outliers are selected before the negative differences are blanked out.
    top = sfr[sfr.iloc[:, 0] > sfr.iloc[:, 0].quantile(0.99)]
    sfr.loc[(sfr.iloc[:, 0] < 0).values, sfr.columns[0]] = None
    fig = plt.figure(figsize=(7, 4.5), dpi=300)
    pplt.Manhattan(data=sfr, Outliers=top, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 5)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('sfs-clear'))
Ejemplo n.º 12
0
def computeIntervals(minSize=500):
    """Return intervals around the windows whose mean absolute score is in the top 1%.

    Args:
        minSize: forwarded to utl.scanGenome.

    Returns:
        Whatever utl.BED.getIntervals returns for the selected windows, called
        with 25000 as its second argument.
    """
    raw = pd.read_pickle(utl.outpath + 'real/scores.df')
    # Signed score: lrh carrying the sign of sh.
    signed = (raw.lrh * raw.sh.apply(np.sign)).rename('H')
    windows = utl.scanGenome(signed.abs(), {'H': lambda x: x.abs().mean()}, minSize=minSize, winSize=50000).H
    selected = windows[windows > windows.quantile(0.99)]
    return utl.BED.getIntervals(selected, 25000)
Ejemplo n.º 13
0
def computeLikelihoodRealCDold(args):
    """Compute the regularized log-likelihood of every row of CD for one (s, h) pair.

    Args: (a single tuple, which is more convenient for multiprocessing)
        args: (CD, E, s, h, regLambda).
        CD: frame with one row per position. Its columns are grouped by their
            first level (the replicate), with a second level whose consecutive
            differences select the transition powers.
            NOTE(review): presumably ColumnLevels=[REP, TIME, ...] and
            IndexLevels=[CHROM, POS] — confirm against the caller.
        E: emission table, looked up with the values stored in CD.
        s: selection strength; with h it selects the transition pickle to load.
        h: second model parameter; with s it selects the transition pickle to load.
        regLambda: weight of the penalty regLambda * abs(s).

    Returns:
        a series, indexed like CD, holding the log-likelihood summed over
        replicates minus regLambda * abs(s). Inputs with more than 4e5 rows
        are processed in 5 batches and concatenated.
    """
    CD, E, s, h, regLambda = args
    print CD.shape, s, h
    # Large inputs: split the rows into batches and recurse on each batch.
    if CD.shape[0] > 4 * 1e5:
        numBatches = 5
        idx = np.arange(CD.shape[0])
        return pd.concat(
                map(lambda x: computeLikelihoodRealCDold((CD.iloc[x], E, s, h, regLambda)),
                    np.array_split(idx, numBatches)))
    # For each of the three replicates: the gaps between consecutive column labels.
    powers = pd.Series(pd.Series(CD[r].columns).diff().values[1:] for r in range(3))
    T = pd.read_pickle(utl.outpath + 'transition/real/S{:02.0f}.H{:02.0f}.df'.format(s * 100, h * 100))
    likes = pd.Series(0, index=CD.index, name=(s, h))
    for rep, df in CD.T.groupby(level=0):
        # Start from the emissions of the first observation of this replicate.
        alpha = E.loc[df.loc[(rep, 0)]]
        for step, power in zip(range(1, df.shape[0]), powers[rep]):
            # Propagate through the transition power, then weight by the next emission.
            alpha = alpha.values.dot(T.loc[power].values) * E.loc[df.loc[rep].iloc[step]]
        likes += utl.vectorizedLog(alpha.mean(1).values)
    return likes - regLambda * abs(s)
Ejemplo n.º 14
0
def computeComale(name='h50.df', recompute=False, q=0.99):
    """Return the windowed COMALE statistic together with a shuffled control, using a pickle cache.

    Args:
        name: file under utl.outpath + 'real/HMM/' to read when computing.
        recompute: when true, ignore an existing cache file.
        q: quantile; each window's COMALE is the mean of the values at or above it.

    Returns:
        Frame with the control column 'COMALENC' followed by the columns of
        the real scan ('COMALE' and 'M').

    NOTE(review): the cache path is fixed to 'h50.COMALE.df' and does not
    depend on `name` or `q` — confirm this is intended.
    """
    cachePath = utl.outpath + 'real/HMM/h50.COMALE.df'
    if os.path.exists(cachePath) and not recompute:
        return pd.read_pickle(cachePath)

    df = pd.read_pickle(utl.outpath + 'real/HMM/' + name)[0.5]
    df['lr'] = (df.alt - df.null) * df.s
    # The control is a copy whose underlying values are shuffled in place.
    shuffled = df.copy(True)
    np.random.shuffle(shuffled.values)
    fcomale = {'COMALE': lambda x: x[x >= x.quantile(q)].mean(), 'M': lambda x: x.size}
    alt = utl.scanGenome(df.lr, fcomale, minSize=200)
    control = utl.scanGenome(shuffled.lr, fcomale, minSize=200)
    control.columns = ['COMALENC', 'M']
    result = pd.concat([control.COMALENC, alt], axis=1)
    result.to_pickle(cachePath)
    return result
Ejemplo n.º 15
0
def createOneMSMS(param, forceToHaveSoftFreq):
    """Run msms once for `param` and write its output to an '.msms' file.

    Args:
        param: dict read with the keys "Ne", "mu", "r", "L", "n", "i" and
            "ModelName".
        forceToHaveSoftFreq: when true, the simulation is repeated until the
            loaded output has at least one column whose mean equals 0.1.

    The previous implementation retried by calling itself without the
    `forceToHaveSoftFreq` argument, which raised TypeError on the first retry.
    The retry is now a loop, which also avoids unbounded recursion depth.
    """
    theta = 2 * param["Ne"] * param["mu"] * param["L"]
    rho = 2 * param["Ne"] * param["r"] * param["L"]
    path = "{}{}/msms/".format(utl.simoutpath, param["ModelName"])
    utl.mkdir(path)
    # Numeric experiment ids are written in scientific notation, others verbatim.
    if isinstance(param["i"], (int, float, long)):
        filename = "{}L{:E}.{:E}.msms".format(path, param["L"], param["i"])
    else:
        filename = "{}L{:E}.{}.msms".format(path, param["L"], param["i"])
    # The command relies on the shell for '~' expansion and output redirection.
    cmd = "java -jar -Xmx2g ~/bin/msms/lib/msms.jar -ms {} 1 -t {:.0f} -r {:.0f} {:.0f} -oFP 0.000000000000E00 > {}".format(
        param["n"], theta, rho, param["L"], filename
    )
    while True:
        subprocess.call(cmd, shell=True)
        if not forceToHaveSoftFreq:
            break
        # Make sure an initial frequency of 0.1 exists; otherwise simulate again.
        if (Simulation.MSMS.load(filename)[0].mean(0) == 0.1).sum():
            break
Ejemplo n.º 16
0
def scanSFS(XX, winSize=10000):
    """Apply est.Estimate.getAllEstimatesX window by window to every column of XX.

    Args:
        XX: frame whose columns are scanned one at a time after dropping NaNs.
        winSize: window size forwarded to utl.scanGenome.

    Returns:
        The scan result with the 'method' level moved to the columns and the
        'POP' and 'GEN' levels moved to the index.
    """
    import popgen.Estimate as est

    def scanColumn(col):
        return utl.scanGenome(col.dropna(), uf=est.Estimate.getAllEstimatesX, winSize=winSize)

    scanned = XX.apply(scanColumn)
    return scanned.unstack("method").stack(["POP", "GEN"])
Ejemplo n.º 17
0
def loadAllScores(h=None, scores=True):
    """Load HMM results from utl.outpath + 'real/HMM/'.

    Args:
        h: when None, every file in the directory is loaded and the frames are
            concatenated column-wise; otherwise only 'h<h>.df' is loaded and
            its `h` column group is selected.
        scores: only used when h is given; when true the result is reduced to
            (alt - null) * sign(s).

    Returns:
        The concatenated frame, the selected column group, or the score series.
    """
    path = utl.outpath + 'real/HMM/'
    if h is None:
        return pd.concat([pd.read_pickle(path + x) for x in utl.files(path)], axis=1)
    a = pd.read_pickle('{}h{:E}.df'.format(path, h))[h]
    if not scores:
        return a
    return (a.alt - a.null) * a.s.apply(np.sign)
Ejemplo n.º 18
0
Archivo: Run.py Proyecto: airanmehr/bio
 def one(method):
     """Merge every run file whose name contains `method` into one pickle under utl.outpath + 'ROC/'.

     For method == 'MarkovChain' each frame is first reduced to the score
     (alt - null) * sign(s), with NaNs replaced by 0 and the index sorted;
     for any other method the frames are concatenated unchanged.
     """
     def score(x):
         return ((x.alt - x.null) * x.s.apply(np.sign)).fillna(0).sort_index()

     runDir = utl.outpath + 'ROC/runs/'
     names = pd.Series(utl.files(runDir))
     names = names[names.apply(lambda x: method in x)]
     frames = [pd.read_pickle(runDir + f) for f in names]
     if method == 'MarkovChain':
         frames = [score(frame) for frame in frames]
     pd.concat(frames).to_pickle(utl.outpath + 'ROC/' + method)
Ejemplo n.º 19
0
def saveAnnotationUCSC():
    """Write two BED-graph tracks: the dbSNP ids and the SNP-effect annotation.

    The annotation table is restricted to the positions where IH is truthy,
    reduced to its first three columns, de-duplicated, and rendered as
    '<Annotation>(<REF>><Allele>)'.

    NOTE(review): `path` and `IH` are not defined in this block; presumably
    module-level names — confirm.
    """
    utl.BED.saveBEDGraph(utl.loadSNPID()["ID"], color="0,0,0", name="dbSNP", fout_name=path + "dbSNP")
    table = pd.read_csv(utl.home + "storage/Data/Dmelanogaster/Hypoxia/popsss/all.ANN.csv", sep="\t")
    # replace({False: None}).dropna() keeps only the index entries where IH is True.
    selected = IH.replace({False: None}).dropna().index
    ann = table.set_index(["CHROM", "POS"]).loc[selected]
    ann = ann.iloc[:, :3].reset_index().drop_duplicates().set_index(["CHROM", "POS"])
    labels = ann.Annotation + "(" + ann.REF + ">" + ann.Allele + ")"
    utl.BED.saveBEDGraph(labels, color="0,0,0", name="SNP effect", fout_name=path + "UCSC/effectSNP")
Ejemplo n.º 20
0
def computeLikelihoodRealBatch(args):
    """Compute the log-likelihood of every row of CD, summed over replicates.

    Args:
        args: tuple (CD, E, T, powers).
        CD: frame whose transposed rows are grouped by their first level (the
            replicate); its values are used as positional row indices into E.
        E: emission table, indexed positionally.
        T: container of transition matrices, looked up with T.loc[power].
        powers: per-replicate sequence of keys into T, one per step.

    Returns:
        Series indexed like CD holding the accumulated log-likelihood.
    """
    CD, E, T, powers = args
    likes = pd.Series(0, index=CD.index)
    for rep, df in CD.T.groupby(level=0):
        # Start from the emissions of the first observation of this replicate.
        alpha = E.iloc[df.loc[(rep, 0)]].values
        for step, power in zip(range(1, df.shape[0]), powers[rep]):
            # Propagate through the transition matrix, then weight by the next emission.
            alpha = alpha.dot(T.loc[power].values) * E.values[df.loc[rep].iloc[step].values]
            # The log-likelihood is deliberately NOT accumulated inside this loop.
        likes += utl.vectorizedLog(alpha.mean(1))  # accumulated once per replicate, after its last step
    return likes
Ejemplo n.º 21
0
def saveLatex():
    """Convert every '.tsv' file in the gowinda output directory into a LaTeX table.

    For each file, columns 0 and 4-9 are kept, rows with fewer than 3 hits are
    dropped, the p-value column is replaced by its negated log10 rounded to
    one decimal, and commas in the gene list are replaced by runs of spaces.
    All columns except the first are written through
    utl.DataframetolaTexTable to utl.paperPath + 'new/<name>.tex'.
    """
    srcDir = '/home/arya/out/real/gowinda/'
    tsvNames = [x for x in utl.files(srcDir) if x[-4:] == '.tsv']
    for name in tsvNames:
        table = pd.read_csv(srcDir + name, sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
        table.columns = ['GO', '-logPval', 'Hits', 'VarGenes', 'TotGenes', 'Term', 'Genes']
        table = table[table.Hits >= 3]
        table['-logPval'] = -table['-logPval'].apply(np.log10).round(1)
        table['Genes'] = table['Genes'].apply(lambda x: x.replace(',', '          '))
        texPath = utl.paperPath + 'new/' + name.replace('.tsv', '.tex')
        utl.DataframetolaTexTable(table.iloc[:, 1:], fname=texPath,
                                  alignment=list('cccc') + ['p{2in}', 'p{2in}'])
Ejemplo n.º 22
0
def Final():
    """Draw the final Manhattan plot with shaded intervals and save it as 'CHROM.FDR_0.01.pdf'.

    NOTE(review): `o` and `shades` are never assigned in this function, so it
    raises NameError unless they exist as module-level names. `o` looks like
    an outlier frame with an `H` column and `shades` like an interval frame
    with a `len` column — confirm where they are meant to come from.
    """
    scores = rutl.loadScores(skipHetChroms=True).abs()
    # Windowed mean absolute score ('H') and window size ('M').
    a = sort(utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}))
    # NOTE(review): `o` is a free name here (see docstring).
    intervals = ga.getIntervals(o.H, padding=30000)
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=o, shade=intervals.reset_index(), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    # Title: (number of rows, summed 'len' in units of 1e6). NOTE(review): `shades` is a free name.
    plt.suptitle((shades.shape[0], shades['len'].sum() / 1e6), fontsize=8)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('CHROM.FDR_0.01'))
Ejemplo n.º 23
0
 def computeTransition(s, N, h=0.5, takeLog=False,nu0_N=None):
     """Build the transition matrix over the frequency grid for parameters s and h.

     Args:
         s, h: forwarded to utl.fx to map every grid frequency to its next
             value, which is then clipped to [0, 1].
         N: grid size parameter; the grid is k / (2N) for k = 0..2N unless
             nu0_N is given. Also forwarded to the inner computeTransition call.
         takeLog: the log-space variant is not implemented; passing a true
             value raises NotImplementedError. It previously fell through to
             `return T` with T unassigned and failed with UnboundLocalError.
         nu0_N: optional alternative grid size; when given, the grid is
             k / (2 * nu0_N) and the matrix is divided by its row sums.

     Returns:
         DataFrame indexed and labelled by the frequency grid.

     Raises:
         NotImplementedError: if takeLog is true.

     NOTE(review): the inner `computeTransition(nu_t, N)` call presumably
     resolves to a module-level function with a (nu_t, N) signature rather
     than to this definition — confirm.
     """
     if takeLog:
         raise NotImplementedError('computeTransition: takeLog=True is not implemented')
     gridN = N if nu0_N is None else nu0_N
     nu0 = np.arange(2 * gridN + 1) / float(2 * gridN)
     nu_t = [max(min(utl.fx(x, s, h=h), 1.), 0.) for x in nu0]
     T = pd.DataFrame(computeTransition(nu_t, N), index=nu0, columns=nu0)
     if nu0_N is not None:
         # NOTE(review): `T / T.sum(1)` aligns the row sums with the COLUMN
         # labels; if row normalization is intended, T.div(T.sum(1), axis=0)
         # would be needed — confirm before changing.
         T = T / T.sum(1)
     return T
Ejemplo n.º 24
0
def computeBaseSFS(recompute=False):
    """Return the scan of the base-generation frequencies, using a pickle cache.

    Args:
        recompute: when true, ignore an existing cache file.

    Returns:
        The frame cached at utl.outpath + 'real/SFS.F0.df'. When computing,
        dta.getBaseFreq() is scanned with est.Estimate.getEstimate
        (n=1000, method='all', selectionPredictor=True) and element 0 of every
        row is expanded into a Series.
    """
    path = utl.outpath + 'real/SFS.F0.df'
    if os.path.exists(path) and not recompute:
        return pd.read_pickle(path)

    x0 = dta.getBaseFreq()
    import popgen.Estimate as est
    scanned = utl.scanGenome(
        x0, lambda x: est.Estimate.getEstimate(x=x, n=1000, method='all', selectionPredictor=True))
    sfs = scanned.apply(lambda row: pd.Series(row[0]), axis=1)
    sfs.to_pickle(path)
    return sfs
Ejemplo n.º 25
0
def computeLocalPval(x,i):
    """Exploratory routine: score every position against a KDE fitted to its neighbourhood.

    NOTE(review): both parameters are overwritten before they are read (`i`
    by the loop variable, `x` by the window slice), and the code reads the
    free names `X` and `a`, which are not defined in this block. Presumably
    `X` was meant to be the `x` argument — confirm before relying on this
    function. Nothing is returned.
    """
    # A single padding of 200 * 1000 on each side of the position.
    wins=np.array([200])*1000
    df=[]
    for i in X.index:
        res=[]
        for pad in wins:
            # Window of positions within `pad` of i.
            x=X[(X.index>=i-pad) & (X.index<=i+pad)]
            # Fit the density on the window without the position itself.
            kde=utl.getDensity(x[x.index != i])
            res+=[utl.getPvalKDE(pd.Series(x.loc[i]),kde)[0]]
        df+=[pd.Series(res,index=wins,name=i)]
    df=pd.DataFrame(df)
    # Scatter of the best window size against the best value for each position.
    pd.concat([df.apply(lambda x:x.idxmax(),1),df.max(1)],1).plot.scatter(x=0,y=1)
    a['pval']=df.max(1).values
    o=a[a.pval>a.pval.quantile(0.999)]

    pplt.Manhattan(a,Outliers=o)

    df.max(1).plot()

    # NOTE(review): `x` is the last window from the loop above; the two sorted
    # results below are discarded.
    y=utl.scan3way(x,winsize=10,f=np.mean)
    x.sort_values()
    y.sort_values()
Ejemplo n.º 26
0
def computeIntervalsBED(padding=25000, cutoff=0.9999):
    """Write the merged intervals around significant COMALE windows as a BED file and a pickle.

    Args:
        padding: number of positions added on each side of a window's POS.
        cutoff: quantile of the COMALENC column used as the threshold.

    Windows whose COMALE exceeds the threshold are padded and merged with
    utl.mergeIntervals. The result is written without header or index to
    'real/intervals.bed', then pickled with an extra 'len' column to
    'real/intervals.df', both under utl.outpath.
    """
    scan = pd.read_pickle(utl.outpath + 'real/HMM/h50.COMALE.df')
    threshold = scan.COMALENC.quantile(cutoff)
    hits = scan[scan.COMALE > threshold].COMALE.reset_index()
    hits['start'] = hits.POS - padding
    hits['end'] = hits.POS + padding
    hits['name'] = '.'
    merged = utl.mergeIntervals(hits[['CHROM', 'start', 'end', 'name', 'COMALE']])
    merged.to_csv(utl.outpath + 'real/intervals.bed', sep='\t', header=None, index=None)
    # The length column is added after the BED export, so it only appears in the pickle.
    merged['len'] = merged.end - merged.start
    merged.to_pickle(utl.outpath + 'real/intervals.df')
Ejemplo n.º 27
0
    def load(ExperimentName, s=0.1, L=50000, experimentID=0, nu0=0.005, isFolded=False, All=False, startGeneration=0,
             maxGeneration=50, numReplicates=3, numSamples=5, step=10, replicates=None, depthRate=np.inf):
        """Load a pickled simulation and configure its replicates, read depth and sampling times.

        Args:
            ExperimentName: sub-directory of utl.simoutpath holding the 'simpop' pickles.
            s, L, experimentID, nu0, isFolded: forwarded to
                Simulation.getSimulationName to build the file name
                (nu0 is passed as initialCarrierFreq).
            All: when true, the sampling times are left untouched.
            startGeneration, maxGeneration, numSamples, step: forwarded to
                sim.setSamplingTimes; maxGeneration is capped at the last
                generation of the simulation.
            numReplicates: used only when `replicates` is None; selects
                range(numReplicates).
            replicates: explicit replicate ids; applied in sorted order.
            depthRate: when finite, sim.X is replaced by C / D at that depth
                rate (rounded through utl.roundto) and the previous X is kept
                in sim.Xi.

        Returns:
            The loaded simulation object, with `savedPath` set to the pickle path.
        """
        simName = Simulation.getSimulationName(s=s, L=L, experimentID=experimentID, initialCarrierFreq=nu0,
                                               isFolded=isFolded)
        path = '{}{}/simpop/'.format(utl.simoutpath, ExperimentName) + simName + '.pkl'
        sim = pd.read_pickle(path)
        sim.savedPath = path

        if replicates is not None:
            sim.setReplicates(sorted(replicates))
        elif numReplicates is not None:
            sim.setReplicates(range(numReplicates))

        if depthRate != np.inf:
            sim.Xi = sim.X
            sim.X = sim.C.loc[depthRate] / sim.D.loc[depthRate].astype(float)
            # Round on a 1e-4 scale: scale up, round each entry, restore the shape, scale down.
            scaled = sim.X.reshape(-1) * 1e4
            rounded = [utl.roundto(v, 5) for v in scaled]
            sim.X = np.array(rounded).reshape(sim.X.shape) / 1e4

        if All:
            return sim
        lastGeneration = sim.getGenerationTimes()[-1]
        sim.setSamplingTimes(maxGeneration=min(maxGeneration, lastGeneration), numSamples=numSamples, step=step,
                             startGeneration=startGeneration)
        return sim
Ejemplo n.º 28
0
Archivo: Run.py Proyecto: airanmehr/bio
def PowerForDepth(method, depthRate, numReplicates=3, samplingWindow=50, L=50000, numExperiments=500, numProcess=4):
    df = [];
    Nu = [0.005, 0.1];
    S = [.025, 0.05, 0.075, 0.1]
    param = {'numExperiments': numExperiments, 'method': method, 'numThreads': numProcess, 'ModelName': 'TimeSeries',
             'samplingWindow': samplingWindow, 'L': L, 'numReplicates': numReplicates, 'depthRate': depthRate}
    print 'Nu={}\tS={}\tnumThreads={}\tmethod={}\tnumExperiments={}'.format(Nu, S, numProcess, method, numExperiments)
    sys.stdout.flush()
    if method == 'HMM' and depthRate == np.inf: return

    for nu0 in Nu:
        param['nu0'] = nu0
        for s in S:
            param['s'] = s
            params = getParamsForExperiments(param)
            if numProcess == 1:
                a = map(runOne, params)
            else:
                pool = Pool(numProcess)
                a = pool.map(runOne, params)
                pool.terminate()
            gc.collect()
            df += [pd.concat(a)]
            print  '\nMethod={}\tR={}\twin={}\tnu0={}\ts={}, depthRate={}'.format(method, numReplicates,
                                                                                  samplingWindow, nu0, s, depthRate)
            sys.stdout.flush()
    for param in params: param['s'] = 0;param['nu0'] = 0.005
    pool = Pool(numProcess)
    df += [pd.concat(pool.map(runOne, params))]
    df=pd.concat(df)
    df.sortlevel(inplace=True)
    df.dropna(axis=1,how='all',inplace=True)
    print df
    outpath = utl.outpath + 'ROC/'
    utl.mkdir(outpath)
    df.to_pickle('{}{}.{:.0f}.df'.format(outpath, method, depthRate))
Ejemplo n.º 29
0
def gowinda():
    """Run utl.getPvalFisher on the gowinda output against a fixed list of GO terms.

    The first column of 'real/gowinda/cand.q99.out' is compared with a
    hard-coded list of GO identifiers, using the unique GO ids of
    'real/GO.df' as the first argument. The resulting pair is computed but
    neither returned nor stored.
    """
    gowindaOut = pd.read_csv(utl.outpath + 'real/gowinda/cand.q99.out', sep='\t', header=None)
    # Whitespace-separated list; split() discards the indentation of the literal.
    referenceTerms = np.array("""GO:0004046
    GO:0015101
    GO:0007501
    GO:0004601
    GO:0006979
    GO:0009312
    GO:0004653
    GO:0040014
    GO:0016485
    GO:0006030
    GO:0020037
    GO:0008061
    GO:0004702""".split())
    Genes = pd.read_pickle(utl.outpath + 'real/GO.df')
    allTerms = Genes.reset_index().GO.unique()
    pval, cont = utl.getPvalFisher(allTerms, gowindaOut[0], referenceTerms)
Ejemplo n.º 30
-1
Archivo: QQ.py Proyecto: airanmehr/bio
def Simulation():
    """Draw the QQ plot of simulated positive and negative COMALE scores and save it as 'qqsim.pdf'.

    Scores are s * (alt - null). The density is estimated from the first
    group (.loc[0]) of the negative-control simulations. The positive series
    is labelled 'data' and the negative one 'null'.
    """
    def score(frame):
        return frame.s * (frame.alt - frame.null)

    a = score(pd.read_pickle('{}ROC/{}.df'.format(utl.outpath, 'COMALE')))
    pos = a.loc[(0.1, 'COMALE', 0.1, 1, 0)]
    neg = a.loc[(0.005, 'COMALE', 0.0, -1, 0)]
    control = pd.read_pickle(utl.outpath + 'real/negativeControl.Simulations.maxLikelihoods.regularized.df').loc[0]
    F = score(control)
    q = np.linspace(0, 1, 1200)
    kde = utl.getDensity(F, width=50)
    positiveQ = utl.getQantilePvalues(pos, kde, quantiles=q)
    negativeQ = utl.getQantilePvalues(neg, kde, quantiles=q)
    qq = pd.concat([positiveQ, negativeQ], axis=1)
    qq.columns = ['data', 'null']
    pplt.QQPval(qq)
    plt.savefig(utl.paperFiguresPath + 'qqsim.pdf')