def computeStatistics():
    """Scan the genome for single- and pairwise-population statistics.

    Reads ``tot.snp.ref.freqs`` (tab-separated, no header row) from the
    module-level ``path``, labels its nine data columns with a ``(POP, REP)``
    MultiIndex and writes three pickles under the same ``path``:

    * ``pairwise.population.df`` -- the mean of every unordered pair of columns;
    * ``single.df`` -- ``est.Estimate.getEstimate(..., n=200, method='all')``
      scanned with ``utl.scanGenome`` over each ``POP`` group of columns;
    * ``pairwise.df`` -- the same scan with ``n=400`` over each pairwise column.

    Returns nothing.  Relies on ``path``, ``pd``, ``utl`` and ``est`` being
    defined at module level.
    """
    import itertools

    labels = 'C1 C2 C3 H1 H2 H3 L1 L2 L3'.split()
    cols = pd.MultiIndex.from_tuples([(label[0], int(label[1])) for label in labels],
                                     names=['POP', 'REP'])
    # list(range(...)) instead of a bare range: `list + range` only works on Python 2.
    a = pd.read_csv(path + 'tot.snp.ref.freqs', sep='\t', header=None,
                    index_col=list(range(4)),
                    names=['CHROM', 'POS', 'REF', 'ALT'] + list(range(9)))
    a.columns = cols

    # One averaged column per unordered pair; the name is both labels glued
    # together, e.g. ('C', 1) and ('C', 2) -> 'C1C2'.
    pairwise = pd.concat(
        [((a[first] + a[second]) / 2).rename(
            ''.join(map(str, first)) + ''.join(map(str, second)))
         for first, second in itertools.combinations(a.columns, 2)],
        axis=1)
    pairwise.to_pickle(path + 'pairwise.population.df')
    reload(est)

    def unroll(frame):
        """Expand a frame of keyed cells into one column block per key.

        The keys are taken from the top-left cell and applied to every cell;
        the new outermost column level is named ``STAT``.
        """
        stat_keys = list(frame.iloc[0, 0].keys())
        expanded = pd.concat(
            [frame.applymap(lambda cell, key=key: cell[key]) for key in stat_keys],
            keys=stat_keys, axis=1)
        expanded.columns.names = ['STAT'] + list(expanded.columns.names[1:])
        return expanded

    single = unroll(a.groupby(level=[0], axis=1).apply(
        lambda grp: utl.scanGenome(
            grp, f=lambda win: est.Estimate.getEstimate(win, n=200, method='all'))[grp.name]))
    single.to_pickle(path + 'single.df')

    pairwise = unroll(pairwise.groupby(level=[0], axis=1).apply(
        lambda grp: utl.scanGenome(
            grp, f=lambda win: est.Estimate.getEstimate(win, n=400, method='all'))[grp.name]))
    pairwise.to_pickle(path + 'pairwise.df')
def computeComale(name='h50.df', recompute=False, q=0.99):
    """Return windowed COMALE statistics for an HMM result, cached on disk.

    Parameters
    ----------
    name : str
        File name of the pickled HMM result under ``utl.outpath + 'real/HMM/'``.
        Its ``0.5`` entry is used and must expose ``alt``, ``null`` and ``s``.
    recompute : bool
        When true, ignore any cached result and rebuild it.
    q : float
        Quantile of the per-window likelihood-ratio values; the ``COMALE``
        statistic is the mean of the values at or above it.

    Returns
    -------
    DataFrame
        ``utl.scanGenome`` output for the observed ``lr`` values, with a leading
        ``COMALENC`` column holding the same statistic computed on a random
        permutation of ``lr`` (the null control).

    Notes
    -----
    The cache file is derived from ``name`` (``h50.df`` -> ``h50.COMALE.df``),
    so different inputs no longer share, or overwrite, the ``h50`` cache.
    ``q`` is not part of the cache key: pass ``recompute=True`` after changing it.
    """
    stem, ext = os.path.splitext(name)
    path = utl.outpath + 'real/HMM/' + stem + '.COMALE' + (ext or '.df')
    if os.path.exists(path) and not recompute:
        return pd.read_pickle(path)

    df = pd.read_pickle(utl.outpath + 'real/HMM/' + name)[0.5]
    df['lr'] = (df.alt - df.null) * df.s

    # Permute the values against a fixed index.  Shuffling `frame.values` in
    # place is a silent no-op whenever pandas hands back a copy of the data.
    null_lr = pd.Series(np.random.permutation(df.lr.values), index=df.index, name='lr')

    fcomale = {'COMALE': lambda x: x[x >= x.quantile(q)].mean(), 'M': lambda x: x.size}
    alt = utl.scanGenome(df.lr, fcomale, minSize=200)
    null = utl.scanGenome(null_lr, fcomale, minSize=200)
    # Relabel by name rather than by position so the result does not depend on
    # the (unordered) dict iteration order of `fcomale`.
    null = null.rename(columns={'COMALE': 'COMALENC'})

    alt = pd.concat([null.COMALENC, alt], axis=1)
    alt.to_pickle(path)
    return alt
def outlier():
    """Pickle and plot global, per-chromosome and local outlier windows.

    Scans the absolute scores (heterochromatin removed) into windows, then for
    each of the three outlier definitions writes the outliers to
    ``real/outliers.<kind>.df`` under ``utl.outpath`` and a Manhattan plot to
    ``new/<kind>.pdf`` under ``utl.paperPath``.
    """

    def _manhattan_to_pdf(series, hits, tag):
        # Shared figure recipe for all three outlier flavours.
        fig = plt.figure(figsize=(7, 1.5), dpi=300)
        pplt.Manhattan(data=series, Outliers=pd.DataFrame(hits), fig=fig, markerSize=2, ticksize=8,
                       sortedAlready=True)
        for ax in fig.get_axes():
            pplt.setSize(ax, 5)
        plt.gcf().subplots_adjust(bottom=0.15)
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format(tag))

    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale
    window_stats = {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), window_stats))[[field, 'Num. of SNPs']]
    a = df.iloc[:, 0]

    # Top 1% of windows genome-wide.
    a = a.rename('Global Outliers')
    o = a[a > a.quantile(0.99)]
    o.to_pickle(utl.outpath + 'real/outliers.global.df')
    _manhattan_to_pdf(a, o, 'global')

    # Top 1% of windows within each first-level index group.
    a = a.rename('Chrom Outliers')
    o = a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    o.to_pickle(utl.outpath + 'real/outliers.chrom.df')
    _manhattan_to_pdf(a, o, 'chrom')

    # Whatever localOutliers() selects.
    a = a.rename('Local Outliers')
    o = localOutliers(a)
    o.to_pickle(utl.outpath + 'real/outliers.local.df')
    _manhattan_to_pdf(a, o, 'local')
def scanSFS():
    """Plot the SFSelect change between generations 0 and 59 next to the score scan.

    Scans the absolute scores, plots them via ``plotOne``, scans SFSelect on
    ``rutl.getNut(0)`` and ``rutl.getNut(59)`` via ``scanOne``, and saves a
    Manhattan plot of the difference (negative differences blanked) together
    with both SFSelect scans and the score scan to ``new/sfs-clear.pdf``.
    """
    scores = rutl.loadScores()
    field = comale
    window_stats = {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), window_stats))[[field, 'Num. of SNPs']]
    plotOne(df, df[df[field] > df[field].quantile(0.99)], fname='all')

    nu0 = rutl.getNut(0)
    nut = rutl.getNut(59)
    reload(rutl)
    # n= int(pd.read_pickle(utl.outpath + 'real/CD.F59.df').loc[:,pd.IndexSlice[:,0,'D']].mean().mean())
    n = 100
    SFSelect = lambda x: est.Estimate.getEstimate(x=x, method='SFSelect', n=n)
    sf0 = scanOne(nu0, SFSelect, 'SFSelect.Base', 'SFSelect.Base')
    sft = scanOne(nut, SFSelect, 'SFSelect.Final', 'SFSelect.Final')

    base_sfs = sf0.iloc[:, 0]
    final_sfs = sft.iloc[:, 0]
    sfr = pd.concat(
        [(final_sfs - base_sfs).rename('SFS(59)-SFS(0)'), base_sfs, final_sfs, df.iloc[:, 0]],
        axis=1)

    # Outliers are picked before the negative differences are blanked out.
    outlier = sfr[sfr.iloc[:, 0] > sfr.iloc[:, 0].quantile(0.99)]
    negative = (sfr.iloc[:, 0] < 0).values
    sfr.loc[negative, sfr.columns[0]] = None

    fig = plt.figure(figsize=(7, 4.5), dpi=300)
    pplt.Manhattan(data=sfr, Outliers=outlier, fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 5)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('sfs-clear'))
def computeIntervals(minSize=500):
    """Return intervals around the top 1% of 50 kb windows by mean |H|.

    ``H`` is ``lrh`` signed by ``sh`` from ``real/scores.df``.  ``minSize`` is
    forwarded to ``utl.scanGenome``; the selected windows are handed to
    ``utl.BED.getIntervals`` with a second argument of 25000.
    """
    scores = pd.read_pickle(utl.outpath + 'real/scores.df')
    signed = (scores.lrh * scores.sh.apply(np.sign)).rename('H')
    windows = utl.scanGenome(signed.abs(), {'H': lambda x: x.abs().mean()},
                             minSize=minSize, winSize=50000)
    h = windows.H
    top = h[h > h.quantile(0.99)]
    return utl.BED.getIntervals(top, 25000)
def scanSFS(XX, winSize=10000):
    """Scan every column of ``XX`` with ``Estimate.getAllEstimatesX``.

    Each column has its NaNs dropped and is scanned by ``utl.scanGenome`` with
    the given ``winSize``.  The result is then unstacked on ``method`` and
    stacked on ``POP`` and ``GEN``.
    """
    import popgen.Estimate as est

    def _scan_column(column):
        return utl.scanGenome(column.dropna(), uf=est.Estimate.getAllEstimatesX, winSize=winSize)

    scanned = XX.apply(_scan_column)
    by_method = scanned.unstack("method")
    return by_method.stack(["POP", "GEN"])
def Final():
    """Save a Manhattan plot of windowed |score| with shaded candidate intervals.

    Writes ``new/CHROM.FDR_0.01.pdf`` under ``utl.paperPath``; returns nothing.

    NOTE(review): ``o`` and ``shades`` are read below but never assigned in this
    function, so it only runs if they exist as module-level names -- confirm,
    or bind them locally.
    """
    scores = rutl.loadScores(skipHetChroms=True).abs()
    # Per-window mean absolute score ('H') and window size ('M').
    a = sort(utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}))
    # NOTE(review): `o` (the outlier windows) is not defined in this function.
    intervals = ga.getIntervals(o.H, padding=30000)
    fig = plt.figure(figsize=(7, 1.5), dpi=300);
    pplt.Manhattan(data=a, Outliers=o, shade=intervals.reset_index(), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 5) for ax in fig.get_axes()];
    plt.gcf().subplots_adjust(bottom=0.15);
    # The title is the tuple (row count of `shades`, sum of its 'len' column / 1e6).
    # NOTE(review): `shades` is not defined here; presumably intervals.reset_index() -- confirm.
    plt.suptitle((shades.shape[0], shades['len'].sum() / 1e6), fontsize=8)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format('CHROM.FDR_0.01'))
def computeBaseSFS(recompute=False):
    """Return the windowed estimates for the base frequencies, cached on disk.

    The result is stored at ``real/SFS.F0.df`` under ``utl.outpath``.  It is
    rebuilt from ``dta.getBaseFreq()`` when the file is missing or when
    ``recompute`` is true; otherwise the pickle is loaded and returned.
    """
    path = utl.outpath + 'real/SFS.F0.df'
    if os.path.exists(path) and not recompute:
        return pd.read_pickle(path)

    x0 = dta.getBaseFreq()
    import popgen.Estimate as est

    def _estimate(window):
        return est.Estimate.getEstimate(x=window, n=1000, method='all', selectionPredictor=True)

    scanned = utl.scanGenome(x0, _estimate)
    # Each row's first element is expanded into its own set of columns.
    sfs = scanned.apply(lambda row: pd.Series(row[0]), axis=1)
    sfs.to_pickle(path)
    return sfs
# Exploratory script: windowed maxima of the 'D' read columns for population 'L',
# followed by per-population score summaries.
# NOTE(review): `pd` is used below but is not imported in this snippet -- confirm
# it is in scope where this runs.
import os; import popgen.Util as utl
import pylab as plt
import popgen.Plots as pplt
import popgen.Estimate as est
# reload(dta)
import popgen.hypoxia.Utils as hutl
a=hutl.load()['L']
# Keep only the 'D' entries of the READ column level.
d=a.xs('D',level='READ',axis=1)
reload(pplt)
print d
# For each group of the first two column levels: maximum per non-overlapping 500 kb window.
dd=d.groupby(level=[0,1],axis=1).apply(lambda xx: utl.scanGenome(xx,f=lambda x:x.max(),winSize=500000,step=500000).iloc[:,0])
pplt.Manhattan(dd)
# plt.savefig(utl.home+'L.coverage.png', format='png', dpi=100)
# Row-wise maximum of the scores loaded for each (population, 17 or 180) pair.
L17=hutl.loadscores('L',17).max(1).rename('L17')
L=hutl.loadscores('L',180).max(1)
C=hutl.loadscores('C',180).max(1)
H=hutl.loadscores('H',180).max(1)
# NOTE: `all` shadows the builtin for the rest of the session.
all=pd.concat([L,C,H],1);all.columns=['L','C','H']
# H=L.apply(lambda x: x.idxmax(),1).rename('h')
# The next two results are not assigned (interactive inspection only).
all.std(1)
all.apply(lambda x: utl.scanGenome(x,f=lambda x:x.mean(),winSize=5000,step=1000)[0])
reload(hutl)
# Exploratory script: per-population counts of polymorphic sites in 100 kb windows
# from the EDAR frequency table, then Pi-based case/control contrasts for `gene`.
# NOTE(review): `pd` and `np` are used below but are not imported in this snippet --
# confirm they are in scope where this runs.
import popgen.Util as utl
import popgen.Estimate as est
import popgen.Kyrgys.Utils as kutl
ppath='/home/arya/storage/Data/Human/20130502/AlleleFrequencies/'
import popgen.Plots as pplt
gene='EDAR'; reload(kutl)
# Half-width added on each side of the lifted-over position below.
padding=500000
a=pd.read_pickle('/media/arya/d4565cf2-d44a-4b67-bf97-226a486c01681/Data/Human/WNG_1000GP_Phase3/EDAR.dfreq.pkl')
# Re-index by (CHROM, POS, ID), then drop the ID level.
a=a.reset_index();a=a.rename(columns={'#CHROM':'CHROM'});a=a.set_index(['CHROM','POS','ID'])
a.index=a.index.droplevel(2)
xx=a.iloc[:,0]
# Column 1 of counts.csv keyed by column 0.
pops=pd.read_csv('/media/arya/d4565cf2-d44a-4b67-bf97-226a486c01681/Data/Human/WNG_1000GP_Phase3/counts.csv',header=None).set_index(0)[1]
# The next two results are not assigned (interactive inspection only).
pops.apply(np.log)
pops.shape
# Per column: number of values strictly between 0 and 1 in each 100 kb window.
m=a.apply(lambda xx: utl.scanGenome(xx,f=lambda x:x[(x>0)&(x<1)].size,winSize=100000))
pops['ALL']=pops.sum()
# Divide by the per-population counts, drop columns containing NaN, then express
# each column relative to the 'ALL' column.
mm=(m/pops).T.dropna().T
mm=mm.apply(lambda x: x/mm['ALL'])
# for gene in ['EDAR','LCT']:
pos,shade=kutl.getPosShade(gene,kutl.getNpop(gene)[-1])
# Collapse the shade to the single position, map it with xmap_bed(…, 38, 19)
# (presumably a genome-build liftover -- confirm), then pad both sides.
shade.start=int(pos);shade.end=int(pos)
shade,unmap=utl.BED.xmap_bed(shade.reset_index(),38,19)
shade.start=shade.start.astype(int)-padding;shade.end=shade.end.astype(int)+padding
z=pd.read_pickle(ppath+'{}.df'.format(gene)).reorder_levels([2,1,0],1).xs(100,level=2,axis=1).dropna()
# z=kutl.getStats(z) # .apply(kutl.normalize);
# z=z['Fst']*z['SFSel']
# 1 - (group / all), 'Pi' entry only, for the case and control groups.
x=(1-z['case']/z['all'] )['Pi'].dropna().astype(float)
y=(1-z['control']/z['all'] )['Pi'].dropna().astype(float)
# Exploratory script comparing HMM likelihood-ratio output with estimated `s`
# along the listed chromosomes.
# NOTE(review): this reads like a pasted interactive session rather than a
# runnable module: `i` and `x2p` are used before they are assigned, and `pd`,
# `np`, `utl` and `estimateS` are not defined in this snippet. Top-to-bottom
# execution therefore depends on state from outside this snippet -- confirm.
import popgen.Run.TimeSeries.RealData.Utils as rutl
import scipy as sc
import popgen.Plots as pplt
import popgen.Run.TimeSeries.RealData.Utils as rutl
import popgen.Run.TimeSeries.RealData.Data as dta
import popgen.TimeSeries.Markov as mkv
S=np.arange(-1,1,0.05).round(2);chroms=['2L','2R','3L','3R','X']
# NOTE(review): `i` is only assigned further down; result is not stored.
pd.read_pickle('/home/arya/out/real/HMM1x/h5.000000E-01.df').loc[i]
scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True)).loc[chroms].rename('score')
cdAll=utl.getEuChromatin(pd.read_pickle('/home/arya/out/real/CD.F59.df').loc[chroms])
# Row sum of the 'C' READ columns divided by row sum of the 'D' READ columns.
freq=lambda x:x.xs('C',level='READ',axis=1).sum(1)/x.xs('D',level='READ',axis=1).sum(1)
# That ratio per GEN group, restricted to generations 0, 37 and 59.
s=estimateS(cdAll.groupby(axis=1,level='GEN').apply(freq)[[0,37,59]])
x=pd.read_pickle('/home/arya/out/real/HMM1x/h5.000000E-01.df').loc[chroms,0.5]
pplt.Manhattan(utl.zpvalgenome(utl.scanGenome(utl.zpvalgenome2tail(s))))
(x.s*(x.alt-x.null)).hist(bins=100)
D=cdAll.xs('D',axis=1,level='READ')
d=D.median(1).rename('d')
# Difference between the alternative and null columns.
f=lambda x:(x.alt-x.null)
# NOTE(review): `x2p` is defined on the following line, after this use.
pplt.Manhattan(utl.scanGenome(x2p(f(x))))
# -log of the upper tail of a chi-square(1 d.o.f.) at X2, keeping X2's index.
# NOTE(review): `sc.stats` requires scipy.stats to have been imported -- confirm.
x2p=lambda X2: -pd.Series(1 - sc.stats.chi2.cdf(X2, 1),index=X2.index).apply(np.log)
y=(f(pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5]).loc[chroms].rename('y')*pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5].s).dropna()
# Result not assigned (interactive inspection only).
y.sort_values()
y=utl.zpvalgenome(pd.read_pickle('/home/arya/out/real/HMM/h5.000000E-01.df')[0.5].s.loc[chroms])
# Index label of the 20th-largest value after sorting.
i=utl.getEuChromatin(y.sort_values()).index[-20]
pplt.GenomeChromosomewise(utl.scanGenome(utl.zpvalgenome(y.abs())))
pplt.GenomeChromosomewise(utl.scanGenome(utl.zpvalgenome(s)))
# Window-level and SNP-level scans of the same series, side by side.
scan=pd.concat([utl.scanGenome(utl.zpvalgenome(s)).rename('win'),utl.scanGenomeSNP(utl.zpvalgenome(s)).rename('snp')],1)
# Script: compare where the top-scoring SNPs fall along the genome with where
# all scored SNPs fall, and save the two normalised densities as a PDF.
import seaborn as sns
import pylab as plt
import matplotlib as mpl
import os
import popgen.Plots as pplt

home = os.path.expanduser("~") + "/"
import popgen.Util as utl
import popgen.Estimate as est
import popgen.Simulation as Simulation

topkWin = 100
topkSNP = 100

# HMM output (entry 0.5) side by side with the null likelihoods.
A = pd.concat(
    [
        pd.read_pickle(utl.outpath + "real/HMM.df")[0.5],
        pd.read_pickle(utl.outpath + "real/nullLikelihoods.df"),
    ],
    axis=1,
)
A["lr"] = A.s * (A.alt - A.null)
A["alr"] = A.s.abs() * (A.alt - A.null)
a = A.lr

# Non-null counts per window (arguments 1e6 and 1e4 passed positionally):
# first for the `topkSNP` largest values, then for every SNP.
density = pd.concat(
    [
        utl.scanGenome(a.sort_values(ascending=False).iloc[:topkSNP].notnull(), np.sum, 1e6, 1e4),
        utl.scanGenome(a.notnull(), np.sum, 1e6, 1e4),
    ],
    axis=1,
)
density = density.fillna(0)
# Normalise each column so it sums to one.
density = density / density.sum()
density.columns = ["Candidate SNPs", "SNPs"]
pplt.Density(density, fname=utl.paperFiguresPath + "candidateSNPDensity.pdf")
def scanOne(a, f, name, fname=None):
    """Scan ``a`` with ``f``, plot the top 1% of windows, and return the scan.

    The returned frame has two columns: ``name`` (the value of ``f`` per window)
    and ``'Num. of SNPs'`` (the window size).  ``fname`` is passed through to
    ``plotOne``.
    """
    windows = utl.scanGenome(a, {name: f, 'Num. of SNPs': lambda x: x.size})
    df = sort(windows)[[name, 'Num. of SNPs']]
    above_cutoff = df[name] > df[name].quantile(0.99)
    plotOne(df, df[above_cutoff], fname=fname)
    return df
def analyzie(minsize=500, winSize=50 * 1000):
    """Scan the absolute scores and plot the top 1% of windows.

    Parameters
    ----------
    minsize : int
        Forwarded to ``utl.scanGenome`` as ``minSize``; also embedded in the
        plot name (``manhattan.min<minsize>``, i.e. ``manhattan.min500`` by
        default).
    winSize : int
        Window size forwarded to ``utl.scanGenome``.  It was previously
        accepted but silently ignored.

    Returns nothing; the plot is produced by ``plotOne``.
    """
    scores = rutl.loadScores()
    window_stats = {comale: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    scanned = utl.scanGenome(scores.abs(), window_stats, minSize=minsize, winSize=winSize)
    df = sort(scanned)[[comale, 'Num. of SNPs']]
    outlier = df[df[comale] > df[comale].quantile(0.99)]
    plotOne(df, outlier, fname='manhattan.min{}'.format(minsize))
def Final():
    """Run the final candidate-region pipeline and write its figures and tables.

    In order, the function:

    1. loads GO annotations, scores and variant annotations;
    2. scans the absolute scores in 30 kb windows, selects local outliers and
       saves the Manhattan plot;
    3. turns the outlier windows into padded intervals and tests each GO term
       for enrichment among the genes in those intervals (LaTeX/CSV tables);
    4. writes candidate and background SNP lists under ``real/gowinda/`` and
       post-processes an existing Gowinda result file;
    5. prints and saves a table of summary counts.

    Returns nothing.  Relies on module-level ``utl``, ``rutl``, ``pplt``,
    ``plt``, ``pd``, ``np``, ``loadGeneData``, ``loadANN`` and
    ``loadGeneCoordinates``.
    """
    ############ preparing data
    def saveGOTex(df):
        # Writes one CSV and one LaTeX table for the group `df`; the file name is
        # the first unique index value of the group.
        name = np.unique(df.index)[0]
        print '*' * 80, name
        df = df.sort_values('-log($p$-value)', ascending=False)
        # Add a rank column and move it (position 6) to the front.
        df['Rank'] = range(1, df.shape[0] + 1); df = df.iloc[:, [6] + range(6)]
        path = utl.paperPath + '/tables/{}.tex'.format(name);
        # The CSV goes to the sibling '/data/' directory and keeps every column;
        # the LaTeX table drops the last column.
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    # NOTE: both lambdas close over `allVariantGenes` and `g`, which are only
    # assigned further down; they must not be called before that point.
    goPvalue = lambda x: utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values, myList=g.index.values)
    # First element rounded to one decimal and capped at 6, plus the
    # ('Putative', 'myList') cell of the second element.
    unpackp = lambda x: [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]
    # Score = lambda x,f:f(scores.loc[x.CHROM][(scores.loc[x.CHROM].index>=x.start)&(scores.loc[x.CHROM].index<=x.end)])
    # Reorder rows by chromosome (X, 2L, 2R, 3L, 3R) and relabel the H/M columns for plotting.
    sort = lambda df: pd.concat(
        [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']]).rename(
        columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})
    Genes = loadGeneData().reset_index().set_index('GO')
    # Keep only GO terms with more than two distinct FBgn identifiers.
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()
    # f=lambda x: x[x>=x.quantile(0.9)].mean()
    # geneScores=ann.reset_index().set_index('Gene_ID')[['CHROM','POS',0]].drop_duplicates().groupby(level=0)[0].apply(f)
    ############ computing candidate regions
    # Per 30 kb window: mean absolute score ('H') and window size ('M').
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    # Select outlier windows on H, then keep all columns of `scan` for them.
    o = utl.localOutliers(scan.H, q=0.99); o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300);
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True);
    [pplt.setSize(ax, 6) for ax in fig.get_axes()];
    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15); pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    # Intervals built from the outlier windows with a padding of 30000.
    regions = utl.BED.getIntervals(o.H, padding=30000); print regions.shape
    # Distinct gene names from the annotation that intersect the intervals.
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name'); print intervalGenes.size
    g = intervalGenes;
    # NOTE(review): the next two lines are read as commented-out code, including
    # the trailing print -- confirm against the original file.
    # intervalGenes
    # g=g[g>=g.quantile(0.)]; print g.size
    # One row per GO term: id, term, capped -log p-value, hit count, ontology,
    # number of distinct genes, and the values shared with the interval genes.
    df = Genes.groupby(level=0).apply(lambda x: pd.DataFrame(
        [x.name, x.term.iloc[0]] + unpackp(goPvalue(x.FBgn.drop_duplicates())) + [x.ontology.iloc[0], x.FBgn.unique().size] + [
            np.intersect1d(x.values, g.index.values)],
        index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T)
    # Keep terms with -log p-value >= 3 and at least 3 hits.
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    # One table per ontology.
    df.groupby(level=0).apply(saveGOTex); print df
    # Gene table for the two hard-coded GO terms GO:0009631 and GO:0009408.
    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(df.set_index('GO ID').loc['GO:0009631'].Genes, df.set_index('GO ID').loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'), alignment=['l', 'l', 'l'])
    regions.to_csv(utl.paperPath + 'data/intervals.csv')
    # NOTE(review): `snps` is not referenced again within this function.
    snps = utl.BED.intersection(scores.reset_index(), regions, 0); snps['POS'] = snps.start; snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        # For one interval group: positive values at or above their own 0.9 quantile.
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index('POS', append=True).name.astype(
            float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print x['len'].iloc[0], y.size
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    # Candidate and background SNP lists (tab-separated, no header, no index).
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt', sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt', sep='\t', header=None, index=False)
    # The Gowinda output is read from a fixed absolute path; this function only
    # reads it, so it must already exist.
    name = 'cands.final.out.tsv'
    gowinda = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gowinda.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowinda = gowinda[gowinda.Hits >= 3]
    # Replace the column read from the file by its negated log10, rounded to one decimal.
    gowinda['-log($p$-value)'] = -gowinda['-log($p$-value)'].apply(np.log10).round(1)
    gowinda.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    # Restrict to GO ids whose ontology is 'biological_process'.
    bp = gowinda.set_index('GO ID').loc[
        Genes[Genes.ontology == 'biological_process'].index.unique().rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'], fname=utl.paperPath + 'tables/gowinda.tex')
    # The value of this expression statement is discarded (interactive leftover).
    map(len, (Genes.index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())), len(
        np.intersect1d(bp.index.unique(), df['GO ID'].unique()))
    # Fisher test on the three GO-id sets: all biological_process ids, the
    # Gowinda ids, and the ids found by the interval-based test above.
    pval = utl.getPvalFisher(Genes[Genes.ontology == 'biological_process'].index.unique(), bp.index.unique(), df.loc['biological_process']['GO ID'].unique())
    print pval
    # Summary counts; every value is formatted with thousands separators below.
    stats = pd.Series(None, name='Value')
    stats['Num. of Vatiants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print stats
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])