Example #1
def scanSFS():
    scores = rutl.loadScores()
    field = comale;
    df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}))[
        [field, 'Num. of SNPs']]
    plotOne(df, df[df[field] > df[field].quantile(0.99)], fname='all')
    nu0 = rutl.getNut(0)
    nut = rutl.getNut(59)
    reload(rutl)
    # n= int(pd.read_pickle(utl.outpath + 'real/CD.F59.df').loc[:,pd.IndexSlice[:,0,'D']].mean().mean())
    n = 100
    SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=n)

    sf0 = scanOne(nu0, SFSelect, 'SFSelect.Base', 'SFSelect.Base');

    sft = scanOne(nut, SFSelect, 'SFSelect.Final', 'SFSelect.Final')

    sfr = pd.concat(
            [(sft.iloc[:, 0] - sf0.iloc[:, 0]).rename('SFS(59)-SFS(0)'), sf0.iloc[:, 0], sft.iloc[:, 0], df.iloc[:, 0]],
            axis=1)
    outlier = sfr[sfr.iloc[:, 0] > sfr.iloc[:, 0].quantile(0.99)]
    sfr.loc[(sfr.iloc[:, 0] < 0).values, sfr.columns[0]] = None
    fig = plt.figure(figsize=(7, 4.5), dpi=300);
    pplt.Manhattan(data=sfr, Outliers=outlier, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('sfs-clear'))
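
# A minimal, self-contained sketch of the pattern above (not the original implementation):
# take the difference of two window scores, mask windows that decreased, and flag the top 1%.
# sf0/sft here are toy Series; the real ones come from scanOne/SFSelect.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([['2L', '2R'], range(10)], names=['CHROM', 'POS'])
sf0 = pd.Series(np.random.rand(20), index=idx, name='SFSelect.Base')
sft = pd.Series(np.random.rand(20), index=idx, name='SFSelect.Final')

diff = (sft - sf0).rename('SFS(59)-SFS(0)')
diff[diff < 0] = np.nan                      # keep only windows where the SFS score increased
outliers = diff[diff > diff.quantile(0.99)]  # top-1% windows, as in the Manhattan call above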
Example #2
def outlier():
    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale;
    df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}))[
        [field, 'Num. of SNPs']]
    a = df.iloc[:, 0]
    a = a.rename('Global Outliers');
    o = a[a > a.quantile(0.99)]
    o.to_pickle(utl.outpath + 'real/outliers.global.df')
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('global'))

    a = a.rename('Chrom Outliers');
    o = a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    o.to_pickle(utl.outpath + 'real/outliers.chrom.df')
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('chrom'))

    a = a.rename('Local Outliers');
    o = localOutliers(a)
    o.to_pickle(utl.outpath + 'real/outliers.local.df')
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()]
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('local'))
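
# Minimal sketch of the per-chromosome outlier rule used in outlier() above (assumes a
# Series `a` indexed by (CHROM, POS)): keep values above their own chromosome's 99th
# percentile. localOutliers() (window-level outliers) is project-specific.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([['2L', '3R', 'X'], range(100)], names=['CHROM', 'POS'])
a = pd.Series(np.random.rand(300), index=idx, name='Chrom Outliers')
thresholds = a.groupby(level='CHROM').transform(lambda x: x.quantile(0.99))
chrom_outliers = a[a > thresholds]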
Example #3
def saveTopKSNPs():
    scores = rutl.loadScores()
    ann = loadANN()["Annotation Annotation_Impact               Gene_Name      Gene_ID".split()]
    scores = pd.concat([scores, rutl.loadSNPIDs()], axis=1).set_index('ID', append=True)[0].rename('Hstatistic')
    top = scores[scores > scores.quantile(0.9999)].reset_index('ID').join(rutl.getNut(0), how='inner')
    top = top.join(ann).drop_duplicates().sort_values('Hstatistic', ascending=False)
    top = top[top['Annotation_Impact'] != 'LOW']
    top.to_csv(utl.outpath + 'real/top_1e-4_quantile_SNPs.csv')
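
# Sketch of the top-SNP/annotation join above with toy frames standing in for
# loadANN()/loadSNPIDs() (both project-specific): keep SNPs above the 0.9999 quantile,
# attach annotations, and drop LOW-impact records.
import numpy as np
import pandas as pd

scores = pd.Series(np.random.rand(10000), name='Hstatistic')
ann = pd.DataFrame({'Annotation_Impact': np.random.choice(['LOW', 'MODERATE', 'HIGH'], 10000)})
top = scores[scores > scores.quantile(0.9999)].to_frame().join(ann)
top = top[top['Annotation_Impact'] != 'LOW'].sort_values('Hstatistic', ascending=False)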
Example #4
def findCandidateSNPs(outlierMode='local'):
    print 'running ', outlierMode
    ann = loadANN()["Annotation Annotation_Impact               Gene_Name      Gene_ID".split()]
    intervals = utl.BED.getIntervals(pd.read_pickle(utl.outpath + 'real/outliers.{}.df'.format(outlierMode)), padding=25000)
    scores = rutl.removeHeteroChromatin(rutl.loadScores()).rename('H')
    scores.shape
    candidates = []
    for _, row in intervals.iterrows():
        row = pd.DataFrame(row).T
        row.index.name = 'CHROM'
        snp = utl.BED.intersection(scores.reset_index(), row, 'H').rename(columns={'name': 'H', 'start': 'POS'})[
            ['POS', 'H']].set_index('POS', append=True)['H'].astype(float)
        snp = snp[snp > snp.quantile(0.99)]
        candidates += [snp]
        print snp.shape, row.iloc[0].loc['len']

    candidates = pd.DataFrame(pd.concat(candidates)).join(ann, how='inner')
    candidates = candidates[candidates['Annotation_Impact'] != 'LOW']
    candidates.to_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))
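
# Sketch of intersecting SNP scores with one padded interval in plain pandas
# (utl.BED.intersection is project-specific): select SNPs whose POS falls inside
# [start, end] and keep the interval-local top 1%, as in the loop above.
import numpy as np
import pandas as pd

pos = np.sort(np.random.randint(0, 1000000, 1000))
snps = pd.Series(np.random.rand(1000), index=pd.Index(pos, name='POS'), name='H')
start, end = 200000, 250000                      # one padded candidate interval
inside = snps[(snps.index >= start) & (snps.index <= end)]
cand = inside[inside > inside.quantile(0.99)]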
Example #5
def Final():
    scores = rutl.loadScores(skipHetChroms=True).abs()
    a = sort(utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}))
    o = a[a.H > a.H.quantile(0.99)]  # assumed outlier definition: top-1% windows by H
    intervals = ga.getIntervals(o.H, padding=30000)
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=o, shade=intervals.reset_index(), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.suptitle((intervals.shape[0], intervals['len'].sum() / 1e6), fontsize=8)  # number of intervals, total length in Mb
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('CHROM.FDR_0.01'))
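
# Sketch of turning outlier-window positions into padded, merged intervals in plain
# Python (ga.getIntervals/utl.BED.getIntervals are project-specific): windows whose
# midpoints lie within 2*padding of each other end up in the same interval.
import pandas as pd

def merge_padded(positions, padding=30000):
    # positions: sorted window midpoints on a single chromosome
    intervals = []
    for p in positions:
        s, e = p - padding, p + padding
        if intervals and s <= intervals[-1][1]:
            intervals[-1][1] = max(intervals[-1][1], e)   # overlaps the previous interval; extend it
        else:
            intervals.append([s, e])
    df = pd.DataFrame(intervals, columns=['start', 'end'])
    df['len'] = df.end - df.start
    return df

print(merge_padded([100000, 140000, 500000]))   # -> two intervals: [70k, 170k] and [470k, 530k]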
Example #6
def plotSNPPval(out):
    scores = rutl.loadScores()
    kde = utl.getDensity(scores, width=1);
    pval = utl.getPvalKDE(out.sort_values(ascending=False).iloc[:1200], kde)
    print pval.sort_values()
    pval[pval >= 3].size
    df = pd.DataFrame(pval)
    df = pd.concat([df[df.index.get_level_values('CHROM') == ch] for ch in
                    ['X', '2L', '2R', '3L', '3R', '4', '2LHet', '2RHet', '3LHet', '3RHet', 'XHet']])
    fig = plt.figure(figsize=(7, 2), dpi=300);
    pplt.Manhattan(df, fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 8) for ax in fig.get_axes()]
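
# Sketch of KDE-based empirical p-values with scipy (utl.getDensity/getPvalKDE are
# project wrappers; this is one standard construction, not necessarily theirs): fit a
# Gaussian KDE to the genome-wide scores and take -log10 of the upper-tail mass beyond
# each candidate score.
import numpy as np
from scipy.stats import gaussian_kde

background = np.random.randn(100000)          # stand-in for all SNP scores
candidates = np.array([3.5, 4.2, 5.0])        # stand-in for the top outlier scores
kde = gaussian_kde(background)
pvals = np.array([kde.integrate_box_1d(c, np.inf) for c in candidates])
neglog10 = -np.log10(np.maximum(pvals, 1e-300))   # guard against p == 0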
Example #7
def createGwandaDataNew():
    def save(df, name='candidatesnps.txt'):
        df.sort_index().reset_index().to_csv(utl.outpath + 'real/gowinda/{}.txt'.format(name), sep='\t', header=None,
                                             index=False)

    scores = rutl.removeHeteroChromatin(rutl.loadScores()).rename('H')
    save(scores, 'allsnps')
    for outlierMode in ['local', 'global', 'chrom']:
        a = pd.read_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))['H'].reset_index().drop_duplicates(
            subset=['CHROM', 'POS']).set_index(['CHROM', 'POS']).H
        save(a, 'cand.' + outlierMode + '.damped.0')
        print a.shape

    ann = loadANN()["Annotation Annotation_Impact               Gene_Name      Gene_ID".split()]

    for dampn in [100, 500, 1000, 2000]:
        damp = scores.sort_values(ascending=False).iloc[:dampn]
        damp = pd.DataFrame(damp).join(ann, how='inner')
        damp = damp[damp['Annotation_Impact'] != 'LOW']['H'].reset_index().drop_duplicates().set_index(['CHROM', 'POS'])['H']
        damp.shape

        for outlierMode in ['local', 'global', 'chrom']:
            a = pd.read_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))[
                'H'].reset_index().drop_duplicates(subset=['CHROM', 'POS']).set_index(['CHROM', 'POS']).H
            a = pd.concat([a, damp]).reset_index().drop_duplicates(subset=['CHROM', 'POS']).set_index(
                    ['CHROM', 'POS']).H
            save(a, 'cand.' + outlierMode + '.damped.{}'.format(dampn))
            print a.shape

    Genes = pd.read_pickle(utl.outpath + 'real/GO.df').set_index('GO')
    Genes = Genes[Genes.AnnID.apply(lambda x: x[:2] == 'CG')]
    Genes.AnnID.value_counts()
    df = pd.concat([Genes.term.drop_duplicates(), Genes.AnnID.groupby(level=0).apply(lambda x: ' '.join(x.tolist()))],
                   axis=1)
    df.to_csv(utl.outpath + 'real/gowinda/goassociation.CG', sep='\t', header=None)

    df = pd.concat([Genes.term.drop_duplicates(), Genes.FBgn.groupby(level=0).apply(lambda x: ' '.join(x.tolist()))],
                   axis=1)
    df.to_csv(utl.outpath + 'real/gowinda/goassociation.FBgn', sep='\t', header=None)
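
# Sketch of the two Gowinda-style outputs written above: a headerless, tab-separated SNP
# list and a GO-association table mapping each GO ID to its term and a space-separated
# gene list. Paths, GO IDs, and gene IDs below are toy placeholders.
import pandas as pd

snps = pd.DataFrame({'CHROM': ['2L', '2L', '3R'], 'POS': [1000, 2500, 400], 'H': [1.2, 3.4, 2.2]})
snps[['CHROM', 'POS', 'H']].to_csv('candidatesnps.txt', sep='\t', header=False, index=False)

go = pd.DataFrame({'GO': ['GO:1', 'GO:1', 'GO:2'],
                   'term': ['term one', 'term one', 'term two'],
                   'AnnID': ['CG0001', 'CG0002', 'CG0003']}).set_index('GO')
assoc = pd.concat([go.term.groupby(level=0).first(),
                   go.AnnID.groupby(level=0).apply(lambda x: ' '.join(x.tolist()))], axis=1)
assoc.to_csv('goassociation.txt', sep='\t', header=False)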
Example #8
def estimateS(y):
    eps=1e-3
    y[0]=y[0].apply(lambda x: min(1-eps,max(x,eps)))
    y[59]=y[59].apply(lambda x: min(1-eps,max(x,eps)))
    s=(2./59 * (utl.logit(y[59])-utl.logit(y[0]))).rename('s')
    return s
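
# Minimal sketch of estimateS with toy data: under (approximately) additive selection
# with h = 0.5, logit allele frequency changes by ~s/2 per generation, so
# s ~ (2/t) * (logit(y_t) - logit(y_0)) with t = 59 generations here.
# utl.logit is assumed to be the standard logit; scipy.special.logit stands in below.
import pandas as pd
from scipy.special import logit

eps = 1e-3
y = pd.DataFrame({0: [0.10, 0.50, 0.999], 59: [0.30, 0.80, 1.00]})   # frequencies at gen 0 and 59
y = y.clip(eps, 1 - eps)                                             # same clamping as estimateS
s = ((2. / 59) * (logit(y[59]) - logit(y[0]))).rename('s')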

import numpy as np
import pandas as pd
import scipy as sc
import popgen.Util as utl
import popgen.Plots as pplt
import popgen.Run.TimeSeries.RealData.Utils as rutl
import popgen.Run.TimeSeries.RealData.Data as dta
import popgen.TimeSeries.Markov as mkv
S=np.arange(-1,1,0.05).round(2);chroms=['2L','2R','3L','3R','X']
pd.read_pickle('/home/arya/out/real/HMM1x/h5.000000E-01.df').loc[i]
scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True)).loc[chroms].rename('score')
cdAll=utl.getEuChromatin(pd.read_pickle('/home/arya/out/real/CD.F59.df').loc[chroms])
freq=lambda x:x.xs('C',level='READ',axis=1).sum(1)/x.xs('D',level='READ',axis=1).sum(1)
s=estimateS(cdAll.groupby(axis=1,level='GEN').apply(freq)[[0,37,59]])

x=pd.read_pickle('/home/arya/out/real/HMM1x/h5.000000E-01.df').loc[chroms,0.5]
pplt.Manhattan(utl.zpvalgenome(utl.scanGenome(utl.zpvalgenome2tail(s))))
(x.s*(x.alt-x.null)).hist(bins=100)
D=cdAll.xs('D',axis=1,level='READ')
d=D.median(1).rename('d')
f=lambda x:(x.alt-x.null)
x2p=lambda X2: -pd.Series(1 - sc.stats.chi2.cdf(X2, 1),index=X2.index).apply(np.log)
pplt.Manhattan(utl.scanGenome(x2p(f(x))))
y=(f(pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5]).loc[chroms].rename('y')*pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5].s).dropna()
y.sort_values()
y=utl.zpvalgenome(pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5].s.loc[chroms])
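
# Sketch of the chi-square -> -log p conversion done by x2p above, using the numerically
# safer survival function instead of 1 - cdf. Note the textbook likelihood-ratio statistic
# is 2*(alt - null) log-likelihoods; the code above feeds (alt - null) directly, so the
# stored values are presumably already on that scale.
import numpy as np
import pandas as pd
from scipy import stats

X2 = pd.Series([0.5, 4.2, 11.0])                          # stand-in test statistics
neglogp = -pd.Series(stats.chi2.sf(X2, 1), index=X2.index).apply(np.log)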
Example #9
def replicatesSanityCheck():
    a = pd.read_csv(utl.home + 'BF37.head', header=None, sep='\t').iloc[:, [0, 1, -1]].set_index([0, 1]).sort_index().iloc[:,0]
    cd=pd.read_pickle('/home/arya/out/real/CD.F59.df').loc[a.index,pd.IndexSlice[:,[0,37]]]
    print (a-utl.CMHcd(cd,damp=0,negLog10=False,eps=0)).abs().sum()


    a = pd.read_csv(utl.home + 'BF15.head', header=None, sep='\t').iloc[:, [0, 1, -1]].set_index([0, 1]).sort_index().iloc[:,0]
    cd=pd.read_pickle('/home/arya/out/real/CD.F59.df').loc[a.index]
    cd=cd.groupby(level=[0],axis=1).apply(lambda x: x.iloc[:,:4]).T.dropna().reset_index()
    cd.GEN=cd.GEN.replace(23,15)
    cd=cd.set_index(['REP','GEN','READ']).T
    print (a-utl.CMHcd(cd,damp=0,negLog10=False,eps=0)).abs().sum()
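
# Hand-rolled Cochran-Mantel-Haenszel test as a cross-check for utl.CMHcd (which is
# project-specific). Each replicate contributes a 2x2 table of read counts:
# rows = generation 0 / 59, columns = derived / ancestral.
import numpy as np
from scipy import stats

def cmh_pvalue(tables):
    """tables: array of shape (K, 2, 2) with counts [[a, b], [c, d]] per replicate."""
    t = np.asarray(tables, dtype=float)
    a = t[:, 0, 0]
    n = t.sum(axis=(1, 2))
    row1, col1 = t[:, 0, :].sum(1), t[:, :, 0].sum(1)
    row2, col2 = t[:, 1, :].sum(1), t[:, :, 1].sum(1)
    expect = row1 * col1 / n
    var = row1 * row2 * col1 * col2 / (n ** 2 * (n - 1))
    chi2 = (abs(a.sum() - expect.sum()) - 0.5) ** 2 / var.sum()   # continuity-corrected statistic
    return stats.chi2.sf(chi2, df=1)

# toy check: three replicates with a consistent frequency shift
print(cmh_pvalue([[[30, 70], [55, 45]], [[25, 75], [50, 50]], [[28, 72], [60, 40]]]))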

if __name__ == '__main__':
    start=time()
    # dta.createF37VCF()
    #dta.computeTransitions()
    # dta.precomputeCDandEmissions()
    # dta.computeF37()
    # dta.computeF59(
    # )
    # options.h=0.5
    rutl.runHMM(options.h)
    #rutl.scanCMH()
    # rutl.computeScores()

    # ga.computeGeneRankings()
    print '\nDone in {:.1f} secs.'.format(time()-start)


Example #10
np.set_printoptions(linewidth=200, precision=5, suppress=True)
import pandas as pd;

pd.options.display.max_rows = 20;
pd.options.display.expand_frame_repr = False
import seaborn as sns
import pylab as plt;
import matplotlib as mpl
import os;

home = os.path.expanduser('~') + '/'
import popgen.Util as utl
import popgen.Estimate as est
import popgen.Run.TimeSeries.RealData.Utils as rutl

a = rutl.loadAllScores().groupby(level='h', axis=1).apply(rutl.HstatisticAll)
df = pd.read_pickle(utl.outpath + 'real/scores.df')
i = df.lrd.sort_values().index[-1]
df.loc[i]

cd = pd.read_pickle(utl.outpath + 'real/CD.F59.df')

import popgen.Plots as pplt
import pylab as plt

names = rutl.loadSNPIDs()
sns.set_style("white", {"grid.color": "0.9", 'axes.linewidth': .5, "grid.linewidth": "9.99"})
mpl.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']});
mpl.rc('text', usetex=True)
reload(pplt)
f, ax = plt.subplots(1, 2, sharey=True, dpi=300, figsize=(4, 2))
Example #11
def analyzie(minsize=500, winSize=50 * 1000):
    scores = rutl.loadScores()
    df = sort(utl.scanGenome(scores.abs(), {comale: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size},
                             minSize=minsize))[[comale, 'Num. of SNPs']]
    outlier = df[df[comale] > df[comale].quantile(0.99)]
    plotOne(df, outlier, fname='manhattan.min500');
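
# Sketch of a fixed-window genome scan in plain pandas, mirroring how utl.scanGenome is
# used above: per 50 kb window on each chromosome, the mean |score| and the number of
# SNPs. (The real scanGenome also takes a minSize argument.)
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_arrays(
    [np.repeat(['2L', '2R'], 5000),
     np.tile(np.sort(np.random.randint(0, 2000000, 5000)), 2)],
    names=['CHROM', 'POS'])
scores = pd.Series(np.random.randn(10000), index=idx)

winSize = 50 * 1000
mid = (scores.index.get_level_values('POS') // winSize) * winSize + winSize // 2
scan = scores.abs().groupby([scores.index.get_level_values('CHROM'), mid]).agg(['mean', 'size'])
scan.index.names = ['CHROM', 'POS']
scan.columns = ['score', 'Num. of SNPs']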
Example #12
def scanSFSSNPbased():
    scores = rutl.loadScores(skipHetChroms=True)
    # field = comale;
    # df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}))[
    #     [field, 'Num. of SNPs']]
    # plotOne(df, df[df[field] > df[field].quantile(0.99)], fname='all')
    reload(rutl)
    reload(pplt)
    reload(utl)
    # SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=100)
    # sfs0 = utl.scanGenomeSNP(rutl.getNut(0, skipHetChroms=True), SFSelect)
    # sfst = utl.scanGenomeSNP(rutl.getNut(59, skipHetChroms=True), SFSelect).rename(59);     sfs=(sfst-sfs0);    sfs[sfs<0]=None
    g = ga.loadGeneCoordinates().set_index('name')
    genes = g.loc[['Ace', 'Cyp6g1', 'CHKov1']].reset_index().set_index('CHROM')

    shade = scores.sort_values().reset_index().iloc[-2:].rename(columns={'POS': 'start'});
    shade['end'] = shade.start + 100
    cand = pd.concat([scores, scores.rank(ascending=False).rename('rank'), rutl.getNut(0, skipHetChroms=True)],
                     axis=1).sort_values('rank')
    chroms = ['2L', '2R', '3L', '3R']
    reload(utl)

    # reload(pplt);pplt.Genome(sfs.loc[chroms],genes=genes);plt.tight_layout(pad=0.1)
    df = pd.concat(
            [utl.scanGenomeSNP(scores.abs(), lambda x: x.mean(), winSize=200, step=100, skipFromFirst=900).rename(200),
             utl.scanGenomeSNP(scores.abs(), lambda x: x.mean(), winSize=500, step=100, skipFromFirst=750).rename(500),
             utl.scanGenomeSNP(scores.abs(), lambda x: x.mean(), winSize=1000, step=100, skipFromFirst=500).rename(
                 1000)], axis=1)
    df['comb'] = df[200] * df[500] * df[1000]

    fig = plt.figure(figsize=(7, 4.5), dpi=300);
    pplt.Manhattan(data=sort(df.rename(columns={'comb': '200*500*1000'})), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('SNPbased'))
    pplt.Genome(df.comb);
    plt.tight_layout(pad=0.1)

    # analyzie()
    # scanSFS()
    # outlier()
    # scanSFSSNPbased()
    a = df.comb
    o = localOutliers(a, q=0.9);
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('SNPbased.candidates'))

    Scores = pd.concat([scores.rename('scores').abs(), scores.groupby(level=0).apply(
        lambda x: pd.Series(range(x.size), index=x.loc[x.name].index)).rename('i')], axis=1)
    cutoff = FDR(o, Scores);

    a = pd.concat([df, cutoff[cutoff.sum(1) > 0]], axis=1).dropna();
    for fdr in [0.95, 0.99, 0.999]:
        o = a[a.comb > a[fdr]]
        fig = plt.figure(figsize=(7, 1.5), dpi=300);
        pplt.Manhattan(data=df.comb, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
        [pplt.setSize(ax, 5) for ax in fig.get_axes()];
        plt.gcf().subplots_adjust(bottom=0.15);
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format('SNPbased.fdr{}'.format(fdr)))
Example #13
def Final():
    ############ preparing data
    def saveGOTex(df):
        name = np.unique(df.index)[0]
        print '*' * 80, name
        df = df.sort_values('-log($p$-value)', ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1);
        df = df.iloc[:, [6] + range(6)]
        path = utl.paperPath + '/tables/{}.tex'.format(name);
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    goPvalue = lambda x: utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values,
                                           myList=g.index.values)
    unpackp = lambda x: [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]
    # Score = lambda x,f:f(scores.loc[x.CHROM][(scores.loc[x.CHROM].index>=x.start)&(scores.loc[x.CHROM].index<=x.end)])
    sort = lambda df: pd.concat(
            [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']]).rename(
        columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})
    Genes = loadGeneData().reset_index().set_index('GO')
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()
    # f=lambda x: x[x>=x.quantile(0.9)].mean()
    # geneScores=ann.reset_index().set_index('Gene_ID')[['CHROM','POS',0]].drop_duplicates().groupby(level=0)[0].apply(f)


    ############ computing candidate regions
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99);
    o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300);
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 6) for ax in fig.get_axes()];

    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15);
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    regions = utl.BED.getIntervals(o.H, padding=30000);
    print regions.shape
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name');
    print intervalGenes.size
    g = intervalGenes;
    # intervalGenes
    # g=g[g>=g.quantile(0.)];
    print g.size
    df = Genes.groupby(level=0).apply(lambda x: pd.DataFrame(
        [x.name, x.term.iloc[0]] + unpackp(goPvalue(x.FBgn.drop_duplicates())) + [x.ontology.iloc[0],
                                                                                  x.FBgn.unique().size] + [
            np.intersect1d(x.values, g.index.values)],
        index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T)
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    df.groupby(level=0).apply(saveGOTex);
    print df

    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(df.set_index('GO ID').loc['GO:0009631'].Genes, df.set_index('GO ID').loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'),
                              alignment=['l', 'l', 'l'])


    regions.to_csv(utl.paperPath + 'data/intervals.csv')

    snps = utl.BED.intersection(scores.reset_index(), regions, 0);
    snps['POS'] = snps.start;
    snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index('POS',
                                                                                                append=True).name.astype(
            float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print x['len'].iloc[0], y.size
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt',
                                                                       sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt',
                                                                        sep='\t', header=None, index=False)

    name = 'cands.final.out.tsv'
    gowinda = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gowinda.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowinda = gowinda[gowinda.Hits >= 3]
    gowinda['-log($p$-value)'] = -gowinda['-log($p$-value)'].apply(np.log10).round(1)
    gowinda.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    bp = gowinda.set_index('GO ID').loc[
        Genes[Genes.ontology == 'biological_process'].index.unique().rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')

    map(len, (Genes.index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())), len(
        np.intersect1d(bp.index.unique(), df['GO ID'].unique()))

    pval = utl.getPvalFisher(Genes[Genes.ontology == 'biological_process'].index.unique(), bp.index.unique(),
                             df.loc['biological_process']['GO ID'].unique())
    print pval

    stats = pd.Series(None, name='Value')

    stats['Num. of Vatiants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print stats
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])
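
# Sketch of the gene-set enrichment test that utl.getPvalFisher presumably wraps: a
# one-sided Fisher's exact test on the 2x2 table of (in GO term) x (in candidate
# intervals), over the universe of all variant genes. Gene names below are toy inputs.
import numpy as np
from scipy import stats

def fisher_neglogp(all_genes, go_genes, candidate_genes):
    universe = set(all_genes)
    go = set(go_genes) & universe
    cand = set(candidate_genes) & universe
    hits = len(go & cand)
    table = [[hits, len(cand) - hits],
             [len(go) - hits, len(universe) - len(go) - len(cand) + hits]]
    _, p = stats.fisher_exact(table, alternative='greater')
    return -np.log10(p), hits

print(fisher_neglogp(['g%d' % i for i in range(1000)],
                     ['g%d' % i for i in range(0, 50)],       # genes annotated to one GO term
                     ['g%d' % i for i in range(0, 200, 4)]))  # genes in candidate intervals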