def gowinda():
    """Test a fixed list of GO terms against the GO terms reported by Gowinda.

    Reads the Gowinda output ``real/gowinda/cand.q99.out`` (tab separated, no
    header; its first column is used as the GO id) and the pickled GO table
    ``real/GO.df`` from ``utl.outpath``, then calls ``utl.getPvalFisher`` with
    every GO id of the table as the universe, the Gowinda GO ids as the second
    argument and the hard-coded reference list as the third.

    Returns:
        tuple: ``(pval, cont)`` exactly as returned by ``utl.getPvalFisher``.
        NOTE(review): elsewhere in this file the first element is labelled
        '-log($p$-value)' and the second is indexed with
        ``.loc['Putative', 'myList']`` -- presumably a -log10 p-value and a
        contingency table; confirm against ``utl.getPvalFisher``.
    """
    gow = pd.read_csv(utl.outpath + 'real/gowinda/cand.q99.out', sep='\t', header=None)

    # Hard-coded reference list of GO ids to compare the Gowinda hits with.
    arya = np.array("""GO:0004046 GO:0015101 GO:0007501 GO:0004601 GO:0006979 GO:0009312 GO:0004653 GO:0040014 GO:0016485 GO:0006030 GO:0020037 GO:0008061 GO:0004702""".split())

    Genes = pd.read_pickle(utl.outpath + 'real/GO.df')
    allGO = Genes.reset_index().GO.unique()

    pval, cont = utl.getPvalFisher(allGO, gow[0], arya)
    # Hand the result back to the caller; without the return the whole
    # computation has no observable effect.
    return pval, cont
def Final():
    """Run the final candidate-interval / GO-enrichment pipeline and write its outputs.

    The function takes no arguments and returns ``None``; everything it
    produces is written to disk or printed. In order, it:

    1. Loads the GO table (keeping GO ids with more than two distinct FBgn
       values), the scores and the variant annotation.
    2. Scans the genome (``winSize=30000``), selects local outliers
       (``q=0.99``) and saves a Manhattan plot.
    3. Builds padded candidate intervals from the outlier windows, collects
       the genes overlapping them and runs ``utl.getPvalFisher`` per GO id.
    4. Writes one LaTeX/CSV table per ontology, a table for two hard-coded GO
       ids and the interval list.
    5. Writes candidate and background SNP files for Gowinda, then reads a
       Gowinda result file, post-processes it and writes its tables.
    6. Writes a table of summary statistics.

    NOTE(review): step 5 reads a Gowinda output file that this function does
    not create; it is assumed to have been produced beforehand by running
    Gowinda outside of this function.
    """
    pLabel = '-log($p$-value)'

    ############ preparing data
    def saveGOTex(df):
        # df holds the rows of a single ontology; that ontology is its index.
        name = np.unique(df.index)[0]
        print('%s %s' % ('*' * 80, name))
        df = df.sort_values(pLabel, ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1)
        # Move the freshly appended 'Rank' column (position 6) to the front.
        df = df.iloc[:, [6] + list(range(6))]
        path = utl.paperPath + '/tables/{}.tex'.format(name)
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        # The last column ('Genes') goes to the CSV only, not to the LaTeX table.
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    def goPvalue(x):
        # allVariantGenes and g are bound further down, before the first call.
        return utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values,
                                 myList=g.index.values)

    def unpackp(x):
        # x is the (score, table) pair from getPvalFisher: cap the rounded
        # score at 6 and read the 'Putative' x 'myList' cell as the hit count.
        return [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]

    def sort(df):
        # Re-order rows chromosome by chromosome and rename columns for plotting.
        chroms = ['X', '2L', '2R', '3L', '3R']
        ordered = pd.concat([df[df.index.get_level_values('CHROM') == ch] for ch in chroms])
        return ordered.rename(columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})

    Genes = loadGeneData().reset_index().set_index('GO')
    # Keep only GO ids annotated with more than two distinct genes.
    genesPerGO = Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique()))
    Genes = Genes.loc[genesPerGO[genesPerGO > 2].index]

    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()

    ############ computing candidate regions
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99)
    o = scan.loc[o.index]

    fig = plt.figure(figsize=(7, 2.5), dpi=300)
    pplt.Manhattan(data=sort(scan), Outliers=sort(o), fig=fig, markerSize=2, ticksize=8, sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 6)
    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')

    regions = utl.BED.getIntervals(o.H, padding=30000)
    print(regions.shape)
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name')
    print(intervalGenes.size)
    g = intervalGenes

    ############ per-GO Fisher test on the genes inside candidate intervals
    def goRow(x):
        # x is the slice of Genes for one GO id; x.name is that id.
        row = ([x.name, x.term.iloc[0]]
               + unpackp(goPvalue(x.FBgn.drop_duplicates()))
               + [x.ontology.iloc[0], x.FBgn.unique().size]
               + [np.intersect1d(x.values, g.index.values)])
        return pd.DataFrame(row, index=['GO ID', 'GO Term', pLabel, 'Hits', 'Ontology', 'Num of Genes',
                                        'Genes']).T

    df = Genes.groupby(level=0).apply(goRow)
    df = df[(df[pLabel] >= 3) & (df.Hits >= 3)]
    df[pLabel] = df[pLabel].astype(str)
    df = df.set_index('Ontology')
    df.groupby(level=0).apply(saveGOTex)
    print(df)

    # Gene table for two hard-coded GO ids.
    byGO = df.set_index('GO ID')
    selected = np.append(byGO.loc['GO:0009631'].Genes, byGO.loc['GO:0009408'].Genes)
    tempGenes = Genes.reset_index().set_index('FBgn').loc[selected][['term', 'name', 'GO']]
    tempGenes = tempGenes.reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'),
                              alignment=['l', 'l', 'l'])
    regions.to_csv(utl.paperPath + 'data/intervals.csv')

    ############ Gowinda input files
    def ff(x):
        # Scores of the variants falling inside interval(s) x ...
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'})
        y = y.set_index('POS', append=True).name.astype(float)
        # ... restricted to positive scores at or above their own 0.9 quantile.
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print('%s %s' % (x['len'].iloc[0], y.size))
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(
        utl.outpath + 'real/gowinda/cands.final.txt', sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(
        utl.outpath + 'real/gowinda/allsnps.txt', sep='\t', header=None, index=False)

    ############ Gowinda results
    name = 'cands.final.out.tsv'
    # NOTE(review): absolute, user-specific path, unlike the utl.outpath-based
    # paths used for the files written just above -- confirm whether this
    # should be utl.outpath + 'real/gowinda/'.
    gw = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[[0, 4, 5, 6, 7, 8, 9]]
    gw.columns = ['GO ID', pLabel, 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gw = gw[gw.Hits >= 3]
    # The column is read as a raw value and converted to -log10 here.
    gw[pLabel] = -gw[pLabel].apply(np.log10).round(1)
    gw.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')

    bpTerms = Genes[Genes.ontology == 'biological_process'].index.unique()
    # NOTE(review): .loc with labels absent from the Gowinda table followed by
    # dropna() relies on missing labels producing NaN rows rather than an
    # error -- verify with the pandas version in use.
    bp = gw.set_index('GO ID').loc[bpTerms.rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', pLabel]], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')

    # Fisher test between the Gowinda GO ids and the GO ids found above,
    # using the biological_process GO ids as the universe.
    pval = utl.getPvalFisher(bpTerms, bp.index.unique(), df.loc['biological_process']['GO ID'].unique())
    print(pval)

    ############ summary statistics
    stats = pd.Series(None, name='Value')
    stats['Num. of Variants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print(stats)
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])
def IntervalAnalysis():
    """Test every GO id for enrichment among genes inside the candidate intervals.

    Takes no arguments and returns ``None``. It reads ``real/scores.df`` and
    ``real/geneScores.df`` from ``utl.outpath`` and writes two LaTeX tables:

    * ``new/intervals.tex`` under ``utl.paperPath``: the first three columns
      of the intervals returned by ``computeIntervals()``, prefixed with a
      running number.
    * ``../new/transcriptFisher.tex`` relative to ``utl.paperFiguresPath``:
      the GO ids whose ``utl.getPvalFisher`` score is above 3 with at least 3
      hits, ranked by decreasing score.

    GO ids for which the Fisher test raises are skipped (best effort).
    """
    ann = loadANN()
    scores = pd.read_pickle(utl.outpath + 'real/scores.df')
    # Signed score: lrh carrying the sign of sh.
    scores = (scores.lrh * scores.sh.apply(np.sign)).rename('H')
    annotated = pd.DataFrame(scores).join(ann, how='inner')

    regions = computeIntervals()
    # .copy() so that adding the numbering column does not write into a slice
    # of the intermediate frame.
    table = regions.reset_index().iloc[:, :3].copy()
    table[' '] = range(1, table.shape[0] + 1)
    utl.DataframetolaTexTable(table.iloc[:, [3, 0, 1, 2]], fname=utl.paperPath + 'new/intervals.tex')

    intervalGenes = utl.BED.intersection(annotated, regions).name.drop_duplicates().reset_index().set_index('name')
    intervalGenes.columns = ['chr']
    intervalGenes.index.name = 'FBgn'

    # Only the index of the gene-score table is used, as the gene universe.
    geneScores = pd.read_pickle(utl.outpath + 'real/geneScores.df')
    Genes = loadGeneData().reset_index().set_index('GO')

    res = []
    # One pass over the GO table; sort=False keeps the GO ids in order of
    # first appearance, i.e. the order Genes.index.unique() would give.
    for go, rows in Genes.groupby(level=0, sort=False):
        gogenes = rows['FBgn'].drop_duplicates()
        try:
            p, cont = utl.getPvalFisher(AllGenes=geneScores.index.values, putativeList=gogenes.values,
                                        myList=intervalGenes.index.values)
            hits = cont.loc['Putative', 'myList']
        except Exception:
            # Best effort: a GO id whose test cannot be computed is left out,
            # but KeyboardInterrupt / SystemExit are no longer swallowed.
            continue
        res.append((go, rows.term.iloc[0], p, hits, gogenes.shape[0]))

    df = pd.DataFrame(res, columns=['GO', 'term', 'p', 'hit', 'numGenes']).set_index('GO')
    df = df[df.p > 3]
    df = df[df.hit >= 3]
    df = df.reset_index().sort_values('p', ascending=False)
    df['p'] = df['p'].round(1)
    df.columns = ['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Num of Genes']
    df['Rank'] = range(1, df.shape[0] + 1)
    # Move the freshly appended 'Rank' column to the front.
    df = df.iloc[:, [-1] + list(range(df.shape[1] - 1))]

    path = utl.paperFiguresPath + '../new/{}Fisher.tex'.format('transcript')
    utl.DataframetolaTexTable(df, alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)