Exemple #1
0
def computeStatistics():
    """Compute single-population and pairwise statistics and pickle them.

    Reads ``tot.snp.ref.freqs`` from the module-level ``path``, labels its nine
    data columns with a (POP, REP) MultiIndex and writes three pickles under
    the same ``path``: ``pairwise.population.df`` (mean of every pair of
    columns), ``single.df`` and ``pairwise.df`` (estimates unrolled per
    statistic, computed with n=200 and n=400 respectively).
    """
    labels = ' C1      C2      C3      H1      H2      H3      L1      L2      L3'.split()
    cols = pd.MultiIndex.from_tuples([(lab[0], int(lab[1])) for lab in labels], names=['POP', 'REP'])
    a = pd.read_csv(path + 'tot.snp.ref.freqs', sep='\t', header=None, index_col=list(range(4)),
                    names=['CHROM', 'POS', 'REF', 'ALT'] + list(range(9)))
    a.columns = cols

    # Average every unordered pair of columns; the new column is named by
    # concatenating both (POP, REP) labels, e.g. 'C1' + 'C2' -> 'C1C2'.
    ncols = a.shape[1]
    pooled = []
    for i in range(ncols):
        for j in range(i + 1, ncols):
            first, second = a.columns[i], a.columns[j]
            label = ''.join(map(str, first)) + ''.join(map(str, second))
            pooled.append(((a[first] + a[second]) / 2).rename(label))
    pairwise = pd.concat(pooled, axis=1)
    pairwise.to_pickle(path + 'pairwise.population.df')

    reload(est)

    def unroll(frame):
        # Every cell is indexable by statistic name; build one frame per
        # statistic and stack them under a leading 'STAT' column level.
        stats = frame.iloc[0, 0].keys()
        layers = []
        for key in stats:
            layers.append(frame.applymap(lambda cell, key=key: cell[key]))
        wide = pd.concat(layers, keys=stats, axis=1)
        wide.columns.names = ['STAT'] + list(wide.columns.names[1:])
        return wide

    def scanWith(n):
        # Build the per-group scanner that runs getEstimate with the given n.
        estimate = lambda x: est.Estimate.getEstimate(x, n=n, method='all')
        return lambda grp: utl.scanGenome(grp, f=estimate)[grp.name]

    single = unroll(a.groupby(level=[0], axis=1).apply(scanWith(200)))
    single.to_pickle(path + 'single.df')
    pairwise = unroll(pairwise.groupby(level=[0], axis=1).apply(scanWith(400)))
    pairwise.to_pickle(path + 'pairwise.df')
Exemple #2
0
def computeComale(name='h50.df', recompute=False, q=0.99):
    """Return the windowed COMALE statistic for an HMM result file, cached on disk.

    Parameters
    ----------
    name : str
        File name under ``utl.outpath + 'real/HMM/'`` holding the HMM output.
    recompute : bool
        When True, ignore any cached result and recompute it.
    q : float
        Quantile of the per-window likelihood ratio above which values are
        averaged to give COMALE.

    Returns
    -------
    DataFrame with the null statistic (``COMALENC``) next to the columns
    returned by ``utl.scanGenome`` for the observed data.

    Notes
    -----
    The cache file is derived from ``name`` (``h50.df`` -> ``h50.COMALE.df``),
    so results for different inputs no longer overwrite or shadow each other.
    The cache does not encode ``q``; pass ``recompute=True`` after changing it.
    """
    stem = name[:-len('.df')] if name.endswith('.df') else name
    path = utl.outpath + 'real/HMM/' + stem + '.COMALE.df'
    if os.path.exists(path) and not recompute:
        return pd.read_pickle(path)

    df = pd.read_pickle(utl.outpath + 'real/HMM/' + name)[0.5]
    df['lr'] = (df.alt - df.null) * df.s

    # Null model: same rows in random order, re-attached to the original index.
    # An explicit permutation is used instead of shuffling ``.values`` in place,
    # which silently does nothing whenever ``.values`` is a copy.
    null = df.iloc[np.random.permutation(len(df))].copy()
    null.index = df.index

    fcomale = {'COMALE': lambda x: x[x >= x.quantile(q)].mean(), 'M': lambda x: x.size}
    alt = utl.scanGenome(df.lr, fcomale, minSize=200)
    null = utl.scanGenome(null.lr, fcomale, minSize=200)
    null.columns = ['COMALENC', 'M']
    alt = pd.concat([null.COMALENC, alt], axis=1)
    alt.to_pickle(path)
    return alt
Exemple #3
0
def _plotOutliers(a, o, tag):
    """Pickle the outliers ``o`` and save a Manhattan plot of ``a`` highlighting them.

    ``tag`` selects both output names: ``real/outliers.<tag>.df`` under
    ``utl.outpath`` and ``new/<tag>.pdf`` under ``utl.paperPath``.
    """
    o.to_pickle(utl.outpath + 'real/outliers.{}.df'.format(tag))
    fig = plt.figure(figsize=(7, 1.5), dpi=300)
    pplt.Manhattan(data=a, Outliers=pd.DataFrame(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 5)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format(tag))


def outlier():
    """Compute global, per-chromosome and local outlier windows and plot each set.

    Scores are loaded through ``rutl``, scanned with the mean absolute score
    per window, and the first column of the sorted scan is thresholded three
    ways. Each set of outliers is pickled and drawn on its own Manhattan plot.
    """
    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale
    df = sort(utl.scanGenome(scores.abs(), {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}))[
        [field, 'Num. of SNPs']]
    a = df.iloc[:, 0]

    # Global: above the genome-wide 99th percentile.
    a = a.rename('Global Outliers')
    _plotOutliers(a, a[a > a.quantile(0.99)], 'global')

    # Per chromosome: above the 99th percentile of each first-level index group.
    a = a.rename('Chrom Outliers')
    _plotOutliers(a, a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name]), 'chrom')

    # Local: delegated to localOutliers.
    a = a.rename('Local Outliers')
    _plotOutliers(a, localOutliers(a), 'local')
Exemple #4
0
def scanSFS():
    """Scan SFSelect on the data from ``rutl.getNut(0)`` and ``rutl.getNut(59)`` and plot the change.

    First plots the windowed mean absolute score (file name 'all'), then runs
    ``scanOne`` with the SFSelect estimator (n=100) on both inputs and saves a
    Manhattan plot of the difference, both scans and the score as
    'sfs-clear.pdf' under ``utl.paperPath + 'new/'``.
    """
    scores = rutl.loadScores()
    countLabel = 'Num. of SNPs'
    windowFuncs = {comale: lambda x: x.abs().mean(), countLabel: lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), windowFuncs))[[comale, countLabel]]
    top = df[df[comale] > df[comale].quantile(0.99)]
    plotOne(df, top, fname='all')

    nu0 = rutl.getNut(0)
    nut = rutl.getNut(59)
    reload(rutl)

    # n is fixed; an earlier version derived it from real/CD.F59.df.
    n = 100
    SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=n)
    sf0 = scanOne(nu0, SFSelect, 'SFSelect.Base', 'SFSelect.Base')
    sft = scanOne(nut, SFSelect, 'SFSelect.Final', 'SFSelect.Final')

    base = sf0.iloc[:, 0]
    final = sft.iloc[:, 0]
    gain = (final - base).rename('SFS(59)-SFS(0)')
    sfr = pd.concat([gain, base, final, df.iloc[:, 0]], axis=1)

    # Outliers are picked before negative differences are blanked out below.
    extreme = sfr[sfr.iloc[:, 0] > sfr.iloc[:, 0].quantile(0.99)]
    sfr.loc[(sfr.iloc[:, 0] < 0).values, sfr.columns[0]] = None

    fig = plt.figure(figsize=(7, 4.5), dpi=300)
    pplt.Manhattan(data=sfr, Outliers=extreme, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 5)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('sfs-clear'))
Exemple #5
0
def computeIntervals(minSize=500):
    """Return intervals around the windows whose mean |H| exceeds the 99th percentile.

    ``minSize`` is forwarded to ``utl.scanGenome``; the window size is fixed
    at 50000 and ``utl.BED.getIntervals`` is called with 25000.
    """
    raw = pd.read_pickle(utl.outpath + 'real/scores.df')
    # H = lrh carrying the sign of sh.
    signed = (raw.lrh * raw.sh.apply(np.sign)).rename('H')
    windows = utl.scanGenome(signed.abs(), {'H': lambda x: x.abs().mean()}, minSize=minSize, winSize=50000)
    h = windows.H
    cutoff = h.quantile(0.99)
    return utl.BED.getIntervals(h[h > cutoff], 25000)
Exemple #6
0
def scanSFS(XX, winSize=10000):
    """Apply ``getAllEstimatesX`` over windows of every column of ``XX``.

    Each column is scanned after dropping its NaNs; the combined result is
    reshaped by unstacking 'method' and stacking 'POP' and 'GEN'.
    """
    import popgen.Estimate as est

    def scanColumn(x):
        return utl.scanGenome(x.dropna(), uf=est.Estimate.getAllEstimatesX, winSize=winSize)

    scanned = XX.apply(scanColumn)
    return scanned.unstack("method").stack(["POP", "GEN"])
Exemple #7
0
def Final():
    """Draw a Manhattan plot with shaded intervals and save it as 'CHROM.FDR_0.01.pdf'.

    NOTE(review): ``o`` and ``shades`` are never assigned inside this function, so it
    only runs if both already exist as module-level names -- confirm where they are
    meant to come from (``shades`` may have been intended to be ``intervals``).
    """
    # Absolute scores, loaded with skipHetChroms=True.
    scores = rutl.loadScores(skipHetChroms=True).abs()
    # Scan with two aggregations: 'H' = mean absolute value, 'M' = size of each group.
    a = sort(utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}))
    # Intervals built from the H column of the (externally defined) outliers, padding 30000.
    intervals = ga.getIntervals(o.H, padding=30000)
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=o, shade=intervals.reset_index(), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True);
    # The list comprehension is used only for its side effect on every axis.
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    # Title is the tuple (number of rows in shades, sum of its 'len' column / 1e6).
    plt.suptitle((shades.shape[0], shades['len'].sum() / 1e6), fontsize=8)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('CHROM.FDR_0.01'))
Exemple #8
0
def computeBaseSFS(recompute=False):
    """Return the estimates scanned over ``dta.getBaseFreq()``, cached in 'real/SFS.F0.df'.

    The pickle under ``utl.outpath`` is reused when it exists unless
    ``recompute`` is True, in which case it is rebuilt and overwritten.
    """
    path = utl.outpath + 'real/SFS.F0.df'
    if os.path.exists(path) and not recompute:
        return pd.read_pickle(path)

    x0 = dta.getBaseFreq()
    import popgen.Estimate as est

    def estimate(x):
        return est.Estimate.getEstimate(x=x, n=1000, method='all', selectionPredictor=True)

    scanned = utl.scanGenome(x0, estimate)
    # Expand element 0 of every row into its own columns.
    sfs = scanned.apply(lambda x: pd.Series(x[0]), axis=1)
    sfs.to_pickle(path)
    return sfs
Exemple #9
0
import os;

import popgen.Util as utl
import pylab as plt
import popgen.Plots as pplt
import popgen.Estimate as est
# reload(dta)
import popgen.hypoxia.Utils as hutl


# Exploratory script (Python 2: print statement, builtin reload).
# NOTE(review): `pd` is used below but is not imported in the visible part of this script.

# Take the 'L' entry of the loaded data and keep the 'D' columns of the READ level.
a=hutl.load()['L']
d=a.xs('D',level='READ',axis=1)

reload(pplt)  # presumably to pick up edits to the plotting module during an interactive session
print d
# For every group of the first two column levels, scan with max() using winSize=500000 and
# step=500000, keeping the first column of each result.
dd=d.groupby(level=[0,1],axis=1).apply(lambda xx: utl.scanGenome(xx,f=lambda x:x.max(),winSize=500000,step=500000).iloc[:,0])
pplt.Manhattan(dd)
# plt.savefig(utl.home+'L.coverage.png', format='png', dpi=100)

# Row-wise maximum of the scores loaded for ('L', 17).
L17=hutl.loadscores('L',17).max(1).rename('L17')

# Row-wise maxima of the scores loaded for 'L', 'C' and 'H' with second argument 180.
L=hutl.loadscores('L',180).max(1)
C=hutl.loadscores('C',180).max(1)
H=hutl.loadscores('H',180).max(1)
# NOTE(review): `all` shadows the builtin of the same name for the rest of the module.
all=pd.concat([L,C,H],1);all.columns=['L','C','H']
# H=L.apply(lambda x: x.idxmax(),1).rename('h')
# The next two expressions are not assigned; their results are only visible interactively.
all.std(1)
all.apply(lambda x: utl.scanGenome(x,f=lambda x:x.mean(),winSize=5000,step=1000)[0])
reload(hutl)

Exemple #10
0
import popgen.Util as utl
import popgen.Estimate as est
import popgen.Kyrgys.Utils as kutl
# Exploratory script around one gene (EDAR); paths are hard-coded to the author's machine.
# NOTE(review): `pd` and `np` are used below but are not imported in the visible part of this script.
ppath='/home/arya/storage/Data/Human/20130502/AlleleFrequencies/'
import popgen.Plots as pplt
gene='EDAR';
reload(kutl)
padding=500000
# Load the pickled frequency table and re-index it by (CHROM, POS), dropping the ID level.
a=pd.read_pickle('/media/arya/d4565cf2-d44a-4b67-bf97-226a486c01681/Data/Human/WNG_1000GP_Phase3/EDAR.dfreq.pkl')
a=a.reset_index();a=a.rename(columns={'#CHROM':'CHROM'});a=a.set_index(['CHROM','POS','ID'])
a.index=a.index.droplevel(2)
xx=a.iloc[:,0]
# Two-column CSV without header: column 0 becomes the index, column 1 the values.
pops=pd.read_csv('/media/arya/d4565cf2-d44a-4b67-bf97-226a486c01681/Data/Human/WNG_1000GP_Phase3/counts.csv',header=None).set_index(0)[1]
# The next two expressions are not assigned; their results are only visible interactively.
pops.apply(np.log)
pops.shape
# Per column: number of entries strictly between 0 and 1 in each scan group (winSize=100000).
m=a.apply(lambda xx: utl.scanGenome(xx,f=lambda x:x[(x>0)&(x<1)].size,winSize=100000))
pops['ALL']=pops.sum()
# Divide by pops and drop columns containing NaN (transpose, dropna, transpose back).
mm=(m/pops).T.dropna().T

# Express every column relative to the 'ALL' column.
mm=mm.apply(lambda x: x/mm['ALL'])
# for gene in ['EDAR','LCT']:
pos,shade=kutl.getPosShade(gene,kutl.getNpop(gene)[-1])
# Collapse the shade to the single position, map it with xmap_bed(…, 38, 19)
# (presumably genome build 38 -> 19; confirm), then widen it by `padding` on both sides.
shade.start=int(pos);shade.end=int(pos)
shade,unmap=utl.BED.xmap_bed(shade.reset_index(),38,19)
shade.start=shade.start.astype(int)-padding;shade.end=shade.end.astype(int)+padding
# Load the per-gene table, reverse its three column levels and keep the entries where the
# (new) level 2 equals 100.
z=pd.read_pickle(ppath+'{}.df'.format(gene)).reorder_levels([2,1,0],1).xs(100,level=2,axis=1).dropna()
# z=kutl.getStats(z)
    # .apply(kutl.normalize);
# z=z['Fst']*z['SFSel']
# 1 - (group / all) for the 'Pi' column, for the 'case' and 'control' groups respectively.
x=(1-z['case']/z['all'] )['Pi'].dropna().astype(float)
y=(1-z['control']/z['all'] )['Pi'].dropna().astype(float)
Exemple #11
0
import popgen.Run.TimeSeries.RealData.Utils as rutl
import scipy as sc
import popgen.Plots as pplt
import popgen.Run.TimeSeries.RealData.Utils as rutl
import popgen.Run.TimeSeries.RealData.Data as dta
import popgen.TimeSeries.Markov as mkv
# Exploratory script; statements are not in a runnable order (see the NOTE(review) markers).
# NOTE(review): `np`, `pd` and `utl` are used below but are not imported in the visible part
# of this script.
S=np.arange(-1,1,0.05).round(2);chroms=['2L','2R','3L','3R','X']
# NOTE(review): `i` is only assigned further down (from y.sort_values()), so this line raises
# NameError on a fresh run; its result is also discarded.
pd.read_pickle('/home/arya/out/real/HMM1x/h5.000000E-01.df').loc[i]
scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True)).loc[chroms].rename('score')
cdAll=utl.getEuChromatin(pd.read_pickle('/home/arya/out/real/CD.F59.df').loc[chroms])
# Ratio of the row sums of the 'C' columns to the row sums of the 'D' columns (READ level).
freq=lambda x:x.xs('C',level='READ',axis=1).sum(1)/x.xs('D',level='READ',axis=1).sum(1)
# NOTE(review): `estimateS` is not defined or imported in the visible part of this script.
s=estimateS(cdAll.groupby(axis=1,level='GEN').apply(freq)[[0,37,59]])

x=pd.read_pickle('/home/arya/out/real/HMM1x/h5.000000E-01.df').loc[chroms,0.5]
pplt.Manhattan(utl.zpvalgenome(utl.scanGenome(utl.zpvalgenome2tail(s))))
# Histogram of s * (alt - null).
(x.s*(x.alt-x.null)).hist(bins=100)
D=cdAll.xs('D',axis=1,level='READ')
d=D.median(1).rename('d')
f=lambda x:(x.alt-x.null)
# NOTE(review): `x2p` is defined on the following line, after this first use.
pplt.Manhattan(utl.scanGenome(x2p(f(x))))
# -log of the chi-square (1 degree of freedom) upper-tail probability of X2.
x2p=lambda X2: -pd.Series(1 - sc.stats.chi2.cdf(X2, 1),index=X2.index).apply(np.log)
y=(f(pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5]).loc[chroms].rename('y')*pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5].s).dropna()
y.sort_values()  # result discarded
# `y` is overwritten here; the value computed two lines above is not used afterwards.
y=utl.zpvalgenome(pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5].s.loc[chroms])
# Index label of the 20th-largest value of y after getEuChromatin.
i=utl.getEuChromatin(y.sort_values()).index[-20]

pplt.GenomeChromosomewise(utl.scanGenome(utl.zpvalgenome(y.abs())))
pplt.GenomeChromosomewise(utl.scanGenome(utl.zpvalgenome(s)))

# Results of scanGenome ('win') and scanGenomeSNP ('snp') on the same input, side by side.
scan=pd.concat([utl.scanGenome(utl.zpvalgenome(s)).rename('win'),utl.scanGenomeSNP(utl.zpvalgenome(s)).rename('snp')],1)
Exemple #12
0
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import os
import popgen.Plots as pplt

home = os.path.expanduser("~") + "/"
import popgen.Util as utl
import popgen.Estimate as est
import popgen.Simulation as Simulation

topkWin = 100
topkSNP = 100

# HMM results stored under key 0.5, side by side with the null likelihoods.
A = pd.concat(
    [
        pd.read_pickle(utl.outpath + "real/HMM.df")[0.5],
        pd.read_pickle(utl.outpath + "real/nullLikelihoods.df"),
    ],
    axis=1,
)
# (alt - null) weighted by s ("lr") and by |s| ("alr").
A["lr"] = (A.alt - A.null) * A.s
A["alr"] = (A.alt - A.null) * A.s.abs()

a = A.lr
# Indicator series: the topkSNP largest values of lr, and every non-null lr.
_topCandidates = a.sort_values(ascending=False).iloc[:topkSNP].notnull()
_allSnps = a.notnull()
density = pd.concat(
    [utl.scanGenome(_topCandidates, np.sum, 1e6, 1e4), utl.scanGenome(_allSnps, np.sum, 1e6, 1e4)],
    axis=1,
).fillna(0)
# Normalise each column so that it sums to one.
density = density / density.sum()
density.columns = ["Candidate SNPs", "SNPs"]
pplt.Density(density, fname=utl.paperFiguresPath + "candidateSNPDensity.pdf")
Exemple #13
0
def scanOne(a, f, name, fname=None):
    """Scan ``a`` with ``f``, plot the windows above the 99th percentile, return the scan.

    The returned frame has the column ``name`` (the value of ``f``) followed
    by 'Num. of SNPs' (the size of each group); ``fname`` is passed to ``plotOne``.
    """
    countLabel = 'Num. of SNPs'
    scanned = utl.scanGenome(a, {name: f, countLabel: lambda x: x.size})
    df = sort(scanned)[[name, countLabel]]
    top = df[df[name] > df[name].quantile(0.99)]
    plotOne(df, top, fname=fname)
    return df
Exemple #14
0
def analyzie(minsize=500, winSize=50 * 1000):
    """Plot the windowed mean absolute score, highlighting the top 1% of windows.

    Parameters
    ----------
    minsize : int
        Forwarded to ``utl.scanGenome`` as ``minSize``; also used in the
        output name ('manhattan.min<minsize>').
    winSize : int
        Forwarded to ``utl.scanGenome`` as ``winSize``. It was previously
        accepted but ignored.
        NOTE(review): default callers are unaffected only if scanGenome's own
        default window is also 50000 -- confirm.
    """
    scores = rutl.loadScores()
    aggregations = {comale: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), aggregations, minSize=minsize, winSize=winSize))[
        [comale, 'Num. of SNPs']]
    outlier = df[df[comale] > df[comale].quantile(0.99)]
    # Name the output after the actual threshold rather than a hard-coded 500.
    plotOne(df, outlier, fname='manhattan.min{}'.format(minsize))
Exemple #15
0
def Final():
    """Run the final candidate-region pipeline and write its figures, tables and data files.

    Steps, in order: load scores and annotations; scan the scores in windows of 30000 and
    keep the local outliers; save the Manhattan figure; turn the outliers into intervals;
    test every GO term for enrichment with ``utl.getPvalFisher``; export inputs for and
    read results of Gowinda; and write a table of summary statistics.

    Python 2 only: uses print statements and ``list + range(...)``.
    """
    ############ preparing data
    def saveGOTex(df):
        """Write the rows of one index group as a CSV (under /data/) and a LaTeX table (under /tables/)."""
        name = np.unique(df.index)[0]
        print '*' * 80, name
        df = df.sort_values('-log($p$-value)', ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1);
        # Move the newly appended 'Rank' (position 6) to the front.
        df = df.iloc[:, [6] + range(6)]
        path = utl.paperPath + '/tables/{}.tex'.format(name);
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        # The last column is left out of the LaTeX table.
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    # Fisher test of a gene list against the candidate genes. `allVariantGenes` and `g` are
    # looked up when the lambda is called; both are assigned further down in this function.
    goPvalue = lambda x: utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values,
                                           myList=g.index.values)
    # [first element rounded to 1 decimal and capped at 6, the ('Putative', 'myList') cell of the second].
    unpackp = lambda x: [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]
    # Score = lambda x,f:f(scores.loc[x.CHROM][(scores.loc[x.CHROM].index>=x.start)&(scores.loc[x.CHROM].index<=x.end)])
    # Reorder rows by chromosome (X, 2L, 2R, 3L, 3R) and give the columns their display names.
    sort = lambda df: pd.concat(
            [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']]).rename(
        columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})
    Genes = loadGeneData().reset_index().set_index('GO')
    # Keep only GO entries with more than 2 distinct FBgn values.
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    # Scores that also appear in the annotation table (inner join).
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()
    # f=lambda x: x[x>=x.quantile(0.9)].mean()
    # geneScores=ann.reset_index().set_index('Gene_ID')[['CHROM','POS',0]].drop_duplicates().groupby(level=0)[0].apply(f)


    ############ computing candidate regions
    # 'H' = mean absolute score, 'M' = size of each group, with winSize=30000.
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99);
    # Keep both columns of the scan for the outlier rows.
    o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300);
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    # The list comprehension is used only for its side effect on every axis.
    [pplt.setSize(ax, 6) for ax in fig.get_axes()];

    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15);
    # The figure is saved twice: through pplt.savefig and as a PDF under utl.paperFiguresPath.
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    regions = utl.BED.getIntervals(o.H, padding=30000);
    print regions.shape
    # Distinct annotation names that intersect the candidate regions, indexed by name.
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name');
    print intervalGenes.size
    g = intervalGenes;
    # intervalGenes
    # g=g[g>=g.quantile(0.)];
    print g.size
    # One row per GO group: ID, term, capped p-value score, hits, ontology, number of distinct
    # FBgn values, and the intersection of the group's values with the candidate gene names.
    df = Genes.groupby(level=0).apply(lambda x: pd.DataFrame(
        [x.name, x.term.iloc[0]] + unpackp(goPvalue(x.FBgn.drop_duplicates())) + [x.ontology.iloc[0],
                                                                                  x.FBgn.unique().size] + [
            np.intersect1d(x.values, g.index.values)],
        index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T)
    # Keep terms with score >= 3 and at least 3 hits.
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    # One CSV and one LaTeX table per ontology.
    df.groupby(level=0).apply(saveGOTex);
    print df

    # Genes of two specific GO terms (GO:0009631 and GO:0009408), written as a LaTeX table.
    # This raises KeyError if either ID did not pass the filter above.
    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(df.set_index('GO ID').loc['GO:0009631'].Genes, df.set_index('GO ID').loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'),
                              alignment=['l', 'l', 'l'])


    regions.to_csv(utl.paperPath + 'data/intervals.csv')

    # NOTE(review): `snps` is built here but not used anywhere later in this function.
    snps = utl.BED.intersection(scores.reset_index(), regions, 0);
    snps['POS'] = snps.start;
    snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        """Return the positive scores intersecting ``x`` that are at or above their 0.9 quantile."""
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index('POS',
                                                                                                append=True).name.astype(
            float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print x['len'].iloc[0], y.size
        return y

    # Candidate variants, selected group by group over the first index level of the reset regions.
    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    # Tab-separated, header-less files under real/gowinda/: the candidates and all scored variants.
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt',
                                                                       sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt',
                                                                        sep='\t', header=None, index=False)

    name = 'cands.final.out.tsv'
    # NOTE(review): this file is not written by this function -- presumably produced by running
    # Gowinda externally on the two files above; the path is hard-coded and does not use utl.outpath.
    gowinda = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gowinda.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowinda = gowinda[gowinda.Hits >= 3]
    # Convert the column read from the file to -log10, rounded to 1 decimal.
    gowinda['-log($p$-value)'] = -gowinda['-log($p$-value)'].apply(np.log10).round(1)
    gowinda.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    # Restrict the Gowinda results to GO IDs whose ontology is 'biological_process'.
    bp = gowinda.set_index('GO ID').loc[
        Genes[Genes.ontology == 'biological_process'].index.unique().rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')

    # NOTE(review): the tuple built by the next statement is discarded (leftover from interactive use).
    map(len, (Genes.index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())), len(
        np.intersect1d(bp.index.unique(), df['GO ID'].unique()))

    # Fisher test on the biological-process GO IDs: the Gowinda set against the set found above.
    pval = utl.getPvalFisher(Genes[Genes.ontology == 'biological_process'].index.unique(), bp.index.unique(),
                             df.loc['biological_process']['GO ID'].unique())
    print pval

    # Summary counts, formatted with thousands separators and written as a LaTeX table.
    stats = pd.Series(None, name='Value')

    # NOTE(review): 'Vatiants' in the label below looks like a typo for 'Variants'; it ends up in the table.
    stats['Num. of Vatiants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print stats
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])