def RunSnvplot(args): cfg = Parse.generate_snvplot_cfg(args) Parse.print_snvplot_options(cfg) if not cfg['debug']: logging.disable(logging.CRITICAL) ro.r('suppressMessages(library(ggplot2))') ro.r('suppressMessages(library(grid))') handle=pysam.TabixFile(filename=cfg['file'],parser=pysam.asVCF()) header = [x for x in handle.header] skip_rows = len(header)-1 cols = header[-1].split() pcols = cfg['pcol'].split(',') cols_extract = [cfg['chrcol'],cfg['bpcol']] + pcols if cfg['qq_strat_freq']: if cfg['freqcol'] not in cols: print Process.Error("frequency column " + cfg['freqcol'] + " not found, unable to proceed with frequency stratified plots").out return 1 else: cols_extract = cols_extract + [cfg['freqcol']] print "frequency column " + cfg['freqcol'] + " found" if cfg['qq_strat_mac']: if cfg['maccol'] not in cols: print Process.Error("minor allele count column " + cfg['maccol'] + " not found, unable to proceed with minor allele count stratified plots").out return 1 else: cols_extract = cols_extract + [cfg['maccol']] print "minor allele count column " + cfg['maccol'] + " found" print "importing data" r = pd.read_table(cfg['file'],sep='\t',skiprows=skip_rows,usecols=cols_extract,compression='gzip') print str(r.shape[0]) + " total variants found" for pcol in pcols: print "plotting p-values for column " + pcol + " ..." results = r[[cfg['chrcol'],cfg['bpcol'],cfg['freqcol'],pcol]] if cfg['freqcol'] in r else r[[cfg['chrcol'],cfg['bpcol'],pcol]] results.dropna(inplace=True) results = results[(results[pcol] > 0) & (results[pcol] <= 1)].reset_index(drop=True) print " " + str(results.shape[0]) + " variants with plottable p-values" results['logp'] = -1 * np.log10(results[pcol]) + 0.0 ro.globalenv['results'] = results l = np.median(scipy.chi2.ppf([1-x for x in results[pcol].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # in R: median(qchisq(results$p, df=1, lower.tail=FALSE))/qchisq(0.5,1) print " genomic inflation (all variants) = " + str(l) if cfg['qq']: print " generating standard qq plot" print " minimum p-value: " + str(np.min(results[pcol])) a = -1 * np.log10(ro.r('ppoints(' + str(len(results.index)) + ')')) a.sort() results.sort_values(by=['logp'], inplace=True) print " maximum -1*log10(p-value): " + str(np.max(results['logp'])) ci_upper = -1 * np.log10(scipy.beta.ppf(0.95, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1))) ci_upper.sort() ci_lower = -1 * np.log10(scipy.beta.ppf(0.05, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1))) ci_lower.sort() ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(results['logp']), 'ci_lower': ro.FloatVector(ci_lower), 'ci_upper': ro.FloatVector(ci_upper)}) dftext_label = 'lambda %~~% ' + str(l) ro.globalenv['dftext'] = ro.DataFrame({'x': ro.r('Inf'), 'y': 0.5, 'lab': dftext_label}) if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq.eps') else: ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.pdf') ro.r(""" gp<-ggplot(df) pp<-gp + aes_string(x='a',y='b') + geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + geom_point(size=2) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if np.max(results['logp']) > cfg['crop']: print " generating cropped standard qq plot" ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1') if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.pdf') ro.r(""" gp<-ggplot(df) pp<-gp + aes_string(x='a',y='b') + geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + geom_point(aes(shape=factor(shape)),size=2) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if cfg['qq_strat_freq']: print " generating frequency stratified qq plot" strat_ticks = [0.005, 0.01, 0.03, 0.05] results['UGA___QQ_BIN___'] = 'E' results.loc[(results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] <= 0.99),'UGA___QQ_BIN___'] = 'D' results.loc[(results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] <= 0.97),'UGA___QQ_BIN___'] = 'C' results.loc[(results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] <= 0.95),'UGA___QQ_BIN___'] = 'B' results.loc[(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9),'UGA___QQ_BIN___'] = 'A' lA='NA' lB='NA' lC='NA' lD='NA' lE='NA' lE_n=len(results[pcol][(results[cfg['freqcol']] < 0.01) | (results[cfg['freqcol']] > 0.99)]) lD_n=len(results[pcol][((results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] < 0.03)) | ((results[cfg['freqcol']] <= 0.99) & (results[cfg['freqcol']] > 0.97))]) lC_n=len(results[pcol][((results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] < 0.05)) | ((results[cfg['freqcol']] <= 0.97) & (results[cfg['freqcol']] > 0.95))]) lB_n=len(results[pcol][((results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] < 0.1)) | ((results[cfg['freqcol']] <= 0.95) & (results[cfg['freqcol']] > 0.9))]) lA_n=len(results[pcol][(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9)]) if lE_n > 0: lE=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['freqcol']] < 0.01) | (results[cfg['freqcol']] > 0.99)].tolist()], df=1))/scipy.chi2.ppf(0.5,1) if lD_n > 0: lD=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] < 0.03)) | ((results[cfg['freqcol']] <= 0.99) & (results[cfg['freqcol']] > 0.97))].tolist()], df=1))/scipy.chi2.ppf(0.5,1) if lC_n > 0: lC=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] < 0.05)) | ((results[cfg['freqcol']] <= 0.97) & (results[cfg['freqcol']] > 0.95))].tolist()], df=1))/scipy.chi2.ppf(0.5,1) if lB_n > 0: lB=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] < 0.1)) | ((results[cfg['freqcol']] <= 0.95) & (results[cfg['freqcol']] > 0.9))].tolist()], df=1))/scipy.chi2.ppf(0.5,1) if lA_n > 0: lA=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9)].tolist()], df=1))/scipy.chi2.ppf(0.5,1) print " genomic inflation (MAF >= 10%, n=" + str(lA_n) + ") = " + str(lA) print " genomic inflation (5% <= MAF < 10%, n=" + str(lB_n) + ") = " + str(lB) print " genomic inflation (3% <= MAF < 5%, n=" + str(lC_n) + ") = " + str(lC) print " genomic inflation (1% <= MAF < 3%, n=" + str(lD_n) + ") = " + str(lD) print " genomic inflation (MAF < 1%, n=" + str(lE_n) + ") = " + str(lE) a = np.array([]) b = np.array([]) c = np.array([]) results.sort_values(by=['logp'], inplace=True) if len(results[results['UGA___QQ_BIN___'] == 'E'].index) > 0: aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'E'].index)) + ')')) aa.sort() bb = results['logp'][results['UGA___QQ_BIN___'] == 'E'] #bb.sort() cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'E'] a = np.append(a,aa) b = np.append(b,bb) c = np.append(c,cc) print " minimum p-value (MAF < 1%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'E'])) print " maximum -1*log10(p-value) (MAF < 1%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'E'])) if len(results[results['UGA___QQ_BIN___'] == 'D'].index) > 0: aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'D'].index)) + ')')) aa.sort() bb = results['logp'][results['UGA___QQ_BIN___'] == 'D'] #bb.sort() cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'D'] a = np.append(a,aa) b = np.append(b,bb) c = np.append(c,cc) print " minimum p-value (1% <= MAF < 3%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'D'])) print " maximum -1*log10(p-value) (1% <= MAF < 3%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'D'])) if len(results[results['UGA___QQ_BIN___'] == 'C'].index) > 0: aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'C'].index)) + ')')) aa.sort() bb = results['logp'][results['UGA___QQ_BIN___'] == 'C'] #bb.sort() cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'C'] a = np.append(a,aa) b = np.append(b,bb) c = np.append(c,cc) print " minimum p-value (3% <= MAF < 5%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'C'])) print " maximum -1*log10(p-value) (3% <= MAF < 5%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'C'])) if len(results[results['UGA___QQ_BIN___'] == 'B'].index) > 0: aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'B'].index)) + ')')) aa.sort() bb = results['logp'][results['UGA___QQ_BIN___'] == 'B'] #bb.sort() cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'B'] a = np.append(a,aa) b = np.append(b,bb) c = np.append(c,cc) print " minimum p-value (5% <= MAF < 10%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'B'])) print " maximum -1*log10(p-value) (5% <= MAF < 10%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'B'])) if len(results[results['UGA___QQ_BIN___'] == 'A'].index) > 0: aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'A'].index)) + ')')) aa.sort() bb = results['logp'][results['UGA___QQ_BIN___'] == 'A'] #bb.sort() cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'A'] a = np.append(a,aa) b = np.append(b,bb) c = np.append(c,cc) print " minimum p-value (MAF >= 10%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'A'])) print " maximum -1*log10(p-value) (MAF >= 10%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'A'])) ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(b), 'UGA___QQ_BIN___': ro.StrVector(c)}) if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.pdf') ro.r(""" gp<-ggplot(df, aes_string(x='a',y='b')) + geom_point(aes_string(color='UGA___QQ_BIN___'), size=2) + scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if np.max(results['logp']) > cfg['crop']: print " generating cropped frequency stratified qq plot" ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1') if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.pdf') ro.r(""" gp<-ggplot(df, aes_string(x='a',y='b')) + geom_point(aes(shape=factor(shape), color=UGA_MAF), size=2) + scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + guides(shape=FALSE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) #if cfg['qq_strat_mac']: # print " generating minor allele count stratified qq plot" # # results['UGA_MAC'] = 'E' # results.loc[results[cfg['maccol']] < 5),'UGA_MAC'] = 'D' # results.loc[(results[cfg['maccol']] >= 0.03) & (results[cfg['maccol']] <= 0.97),'UGA_MAC'] = 'C' # results.loc[(results[cfg['maccol']] >= 0.05) & (results[cfg['maccol']] <= 0.95),'UGA_MAC'] = 'B' # results.loc[(results[cfg['maccol']] >= 0.1) & (results[cfg['maccol']] <= 0.9),'UGA_MAC'] = 'A' # lA='NA' # lB='NA' # lC='NA' # lD='NA' # lE='NA' # lE_n=len(results[pcol][(results[cfg['maccol']] < 0.01) | (results[cfg['maccol']] > 0.99)]) # lD_n=len(results[pcol][((results[cfg['maccol']] >= 0.01) & (results[cfg['maccol']] < 0.03)) | ((results[cfg['maccol']] <= 0.99) & (results[cfg['maccol']] > 0.97))]) # lC_n=len(results[pcol][((results[cfg['maccol']] >= 0.03) & (results[cfg['maccol']] < 0.05)) | ((results[cfg['maccol']] <= 0.97) & (results[cfg['maccol']] > 0.95))]) # lB_n=len(results[pcol][((results[cfg['maccol']] >= 0.05) & (results[cfg['maccol']] < 0.1)) | ((results[cfg['maccol']] <= 0.95) & (results[cfg['maccol']] > 0.9))]) # lA_n=len(results[pcol][(results[cfg['maccol']] >= 0.1) & (results[cfg['maccol']] <= 0.9)]) # if lE_n > 0: # lE=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['maccol']] < 0.01) | (results[cfg['maccol']] > 0.99)].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # if lD_n > 0: # lD=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['maccol']] >= 0.01) & (results[cfg['maccol']] < 0.03)) | ((results[cfg['maccol']] <= 0.99) & (results[cfg['maccol']] > 0.97))].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # if lC_n > 0: # lC=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['maccol']] >= 0.03) & (results[cfg['maccol']] < 0.05)) | ((results[cfg['maccol']] <= 0.97) & (results[cfg['maccol']] > 0.95))].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # if lB_n > 0: # lB=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['maccol']] >= 0.05) & (results[cfg['maccol']] < 0.1)) | ((results[cfg['maccol']] <= 0.95) & (results[cfg['maccol']] > 0.9))].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # if lA_n > 0: # lA=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['maccol']] >= 0.1) & (results[cfg['maccol']] <= 0.9)].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # print " genomic inflation (MAF >= 10%, n=" + str(lA_n) + ") = " + str(lA) # print " genomic inflation (5% <= MAF < 10%, n=" + str(lB_n) + ") = " + str(lB) # print " genomic inflation (3% <= MAF < 5%, n=" + str(lC_n) + ") = " + str(lC) # print " genomic inflation (1% <= MAF < 3%, n=" + str(lD_n) + ") = " + str(lD) # print " genomic inflation (MAF < 1%, n=" + str(lE_n) + ") = " + str(lE) # # a = np.array([]) # b = np.array([]) # c = np.array([]) # results.sort_values(by=['logp'], inplace=True) # if len(results[results['UGA_MAC'] == 'E'].index) > 0: # aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'E'].index)) + ')')) # aa.sort() # bb = results['logp'][results['UGA_MAC'] == 'E'] # #bb.sort() # cc = results['UGA_MAC'][results['UGA_MAC'] == 'E'] # a = np.append(a,aa) # b = np.append(b,bb) # c = np.append(c,cc) # print " minimum p-value (MAF < 1%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'E'])) # print " maximum -1*log10(p-value) (MAF < 1%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'E'])) # if len(results[results['UGA_MAC'] == 'D'].index) > 0: # aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'D'].index)) + ')')) # aa.sort() # bb = results['logp'][results['UGA_MAC'] == 'D'] # #bb.sort() # cc = results['UGA_MAC'][results['UGA_MAC'] == 'D'] # a = np.append(a,aa) # b = np.append(b,bb) # c = np.append(c,cc) # print " minimum p-value (1% <= MAF < 3%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'D'])) # print " maximum -1*log10(p-value) (1% <= MAF < 3%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'D'])) # if len(results[results['UGA_MAC'] == 'C'].index) > 0: # aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'C'].index)) + ')')) # aa.sort() # bb = results['logp'][results['UGA_MAC'] == 'C'] # #bb.sort() # cc = results['UGA_MAC'][results['UGA_MAC'] == 'C'] # a = np.append(a,aa) # b = np.append(b,bb) # c = np.append(c,cc) # print " minimum p-value (3% <= MAF < 5%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'C'])) # print " maximum -1*log10(p-value) (3% <= MAF < 5%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'C'])) # if len(results[results['UGA_MAC'] == 'B'].index) > 0: # aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'B'].index)) + ')')) # aa.sort() # bb = results['logp'][results['UGA_MAC'] == 'B'] # #bb.sort() # cc = results['UGA_MAC'][results['UGA_MAC'] == 'B'] # a = np.append(a,aa) # b = np.append(b,bb) # c = np.append(c,cc) # print " minimum p-value (5% <= MAF < 10%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'B'])) # print " maximum -1*log10(p-value) (5% <= MAF < 10%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'B'])) # if len(results[results['UGA_MAC'] == 'A'].index) > 0: # aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'A'].index)) + ')')) # aa.sort() # bb = results['logp'][results['UGA_MAC'] == 'A'] # #bb.sort() # cc = results['UGA_MAC'][results['UGA_MAC'] == 'A'] # a = np.append(a,aa) # b = np.append(b,bb) # c = np.append(c,cc) # print " minimum p-value (MAF >= 10%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'A'])) # print " maximum -1*log10(p-value) (MAF >= 10%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'A'])) # # ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(b), 'UGA_MAC': ro.StrVector(c)}) # # if cfg['ext'] == 'tiff': # ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat.tiff') # elif cfg['ext'] == 'eps': # ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat.eps') # else: # ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat.pdf') # ro.r(""" # gp<-ggplot(df, aes_string(x='a',y='b')) + # geom_point(aes_string(color='UGA_MAC'), size=2) + # scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) + # geom_abline(intercept=0, slope=1, alpha=0.5) + # scale_x_continuous(expression(Expected~~-log[10](italic(p)))) + # scale_y_continuous(expression(Observed~~-log[10](italic(p)))) + # theme_bw(base_size = 12) + # theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), # legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), # legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), # panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) # %s # """ % (ggsave)) # # if np.max(results['logp']) > cfg['crop']: # print " generating cropped frequency stratified qq plot" # ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) # ro.r('df$shape<-0') # ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1') # if cfg['ext'] == 'tiff': # ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat.cropped.tiff') # elif cfg['ext'] == 'eps': # ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat.cropped.eps') # else: # ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat.cropped.pdf') # ro.r(""" # gp<-ggplot(df, aes_string(x='a',y='b')) + # geom_point(aes(shape=factor(shape), color=UGA_MAC), size=2) + # scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) + # geom_abline(intercept=0, slope=1, alpha=0.5) + # scale_x_continuous(expression(Expected~~-log[10](italic(p)))) + # scale_y_continuous(expression(Observed~~-log[10](italic(p)))) + # theme_bw(base_size = 12) + # guides(shape=FALSE) + # theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), # legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), # legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), # panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) # %s # """ % (ggsave)) if cfg['mht']: print " generating standard manhattan plot" print " minimum p-value: " + str(np.min(results[pcol])) print " maximum -1*log10(p-value): " + str(np.max(results['logp'])) if cfg['gc'] and l > 1: print " adjusting p-values for genomic inflation for p-value column " + pcol results[pcol]=2 * scipy.norm.cdf(-1 * np.abs(scipy.norm.ppf(0.5*results[pcol]) / math.sqrt(l))) print " minimum post-gc adjustment p-value: " + str(np.min(results[pcol])) print " maximum post-gc adjustment -1*log10(p-value): " + str(np.max(results['logp'])) else: print " skipping genomic inflation correction" print " calculating genomic positions" results.sort_values(by=[cfg['chrcol'],cfg['bpcol']], inplace=True) ticks = [] lastbase = 0 results['gpos'] = 0 nchr = len(list(np.unique(results[cfg['chrcol']].values))) chrs = np.unique(results[cfg['chrcol']].values) if cfg['color']: colours = ["#08306B","#41AB5D","#000000","#F16913","#3F007D","#EF3B2C","#08519C","#238B45","#252525","#D94801","#54278F","#CB181D","#2171B5","#006D2C","#525252","#A63603","#6A51A3","#A50F15","#4292C6","#00441B","#737373","#7F2704","#807DBA","#67000D"] else: colours = ["#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3"] if nchr == 1: results['gpos'] = results[cfg['bpcol']] results['colours'] = "#08589e" if results['gpos'].max() - results['gpos'].min() <= 1000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 100 == 0] elif results['gpos'].max() - results['gpos'].min() <= 10000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 1000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 100000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 10000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 200000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 20000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 300000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 30000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 400000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 40000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 500000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 50000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 600000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 60000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 700000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 70000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 800000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 80000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 900000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 90000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 1000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 100000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 10000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 1000000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 100000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 10000000 == 0] elif results['gpos'].max() - results['gpos'].min() > 100000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 25000000 == 0] else: results['colours'] = "#000000" for i in range(len(chrs)): print " processed chromosome " + str(int(chrs[i])) if i == 0: results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']] else: lastbase = lastbase + results.loc[results[cfg['chrcol']] == chrs[i-1],cfg['bpcol']].iloc[-1] results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = (results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]) + lastbase if results.loc[results[cfg['chrcol']] == chrs[i]].shape[0] > 1: ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0] + (results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[-1] - results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])/2) else: ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0]) results.loc[results[cfg['chrcol']] == chrs[i],'colours'] = colours[int(chrs[i])] results['logp'] = -1 * np.log10(results[pcol]) if results.shape[0] >= 1000000: sig = 5.4e-8 else: sig = 0.05 / results.shape[0] print " significance level set to p-value = " + str(sig) + " (-1*log10(p-value) = " + str(-1 * np.log10(sig)) + ")" print " " + str(len(results[pcol][results[pcol] <= sig])) + " genome wide significant variants" chr = results[cfg['chrcol']][0] maxy=int(max(np.ceil(-1 * np.log10(sig)),np.ceil(results['logp'].max()))) if maxy > 20: y_breaks = range(0,maxy,5) y_labels = range(0,maxy,5) else: y_breaks = range(0,maxy) y_labels = range(0,maxy) ro.globalenv['df'] = ro.DataFrame({'gpos': ro.FloatVector(results['gpos']), 'logp': ro.FloatVector(results['logp']), 'colours': ro.FactorVector(results['colours'])}) ro.globalenv['ticks'] = ro.FloatVector(ticks) ro.globalenv['labels'] = ro.Vector(["{:,}".format(x/1000) for x in ticks]) ro.globalenv['colours'] = ro.StrVector(colours) ro.globalenv['chrs'] = ro.FloatVector(chrs) print " generating manhattan plot" if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.mht.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.pdf') if nchr == 1: ro.r(""" gp<-ggplot(df, aes_string(x='gpos',y='logp')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(size=1.5) + scale_x_continuous(expression(Chromosome~~%d~~(kb))'),breaks=ticks,labels=labels) + \ scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + \ theme_bw(base_size = 8) + \ theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, chr, maxy, maxy, ggsave)) else: ro.r(""" gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(size=1.5) + scale_colour_manual(values=colours) + scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, maxy, maxy, ggsave)) if maxy > cfg['crop']: maxy = cfg['crop'] ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1') print " generating cropped manhattan plot" if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.tiff') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.mht.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.cropped.pdf') if nchr == 1: ro.r(""" gp<-ggplot(df, aes_string(x='gpos',y='logp')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(aes(shape=factor(shape)),size=1.5) + scale_x_continuous(expression(Chromosome~~%d~~(kb))'),breaks=ticks,labels=labels) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, chr, maxy, maxy, ggsave)) else: ro.r(""" gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(aes(shape=factor(shape)),size=1.5) + scale_colour_manual(values=colours) + scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=8), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, maxy, maxy, ggsave)) print "process complete" return 0
def RunSnvplot(args): cfg = Parse.generate_snvplot_cfg(args) Parse.print_snvplot_options(cfg) if not cfg['debug']: logging.disable(logging.CRITICAL) ro.r('suppressMessages(library(ggplot2))') ro.r('suppressMessages(library(grid))') ro.r('suppressMessages(library(RColorBrewer))') handle=pysam.TabixFile(filename=cfg['file'],parser=pysam.asVCF()) header = [x for x in handle.header] skip_rows = len(header)-1 cols = header[-1].split() pcols = cfg['pcol'].split(',') cols_extract = [cfg['chrcol'],cfg['bpcol']] + pcols if cfg['qq_strat_freq']: if cfg['freqcol'] not in cols: print Process.Error("frequency column " + cfg['freqcol'] + " not found, unable to proceed with frequency stratified plots").out return 1 else: cols_extract = cols_extract + [cfg['freqcol']] print "frequency column " + cfg['freqcol'] + " found" if cfg['qq_strat_mac']: if cfg['maccol'] not in cols: print Process.Error("minor allele count column " + cfg['maccol'] + " not found, unable to proceed with minor allele count stratified plots").out return 1 else: cols_extract = cols_extract + [cfg['maccol']] print "minor allele count column " + cfg['maccol'] + " found" print "importing data" r = pd.read_table(cfg['file'],sep='\t',skiprows=skip_rows,usecols=cols_extract,compression='gzip') print str(r.shape[0]) + " total variants found" for pcol in pcols: print "plotting p-values for column " + pcol + " ..." extract_cols = [cfg['chrcol'],cfg['bpcol'],pcol] if cfg['freqcol'] in r: extract_cols = extract_cols + [cfg['freqcol']] if cfg['maccol'] in r: extract_cols = extract_cols + [cfg['maccol']] results = r[extract_cols] results.dropna(inplace=True) results = results[(results[pcol] > 0) & (results[pcol] <= 1)].reset_index(drop=True) print " " + str(results.shape[0]) + " variants with plottable p-values" results['logp'] = -1 * np.log10(results[pcol]) + 0.0 ro.globalenv['results'] = results l = np.median(scipy.chi2.ppf([1-x for x in results[pcol].tolist()], df=1))/scipy.chi2.ppf(0.5,1) # in R: median(qchisq(results$p, df=1, lower.tail=FALSE))/qchisq(0.5,1) print " genomic inflation (all variants) = " + str(l) if cfg['qq']: print " generating standard qq plot" print " minimum p-value: " + str(np.min(results[pcol])) a = -1 * np.log10(ro.r('ppoints(' + str(len(results.index)) + ')')) a.sort() results.sort_values(by=['logp'], inplace=True) print " maximum -1*log10(p-value): " + str(np.max(results['logp'])) ci_upper = -1 * np.log10(scipy.beta.ppf(0.95, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1))) ci_upper.sort() ci_lower = -1 * np.log10(scipy.beta.ppf(0.05, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1))) ci_lower.sort() ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(results['logp']), 'ci_lower': ro.FloatVector(ci_lower), 'ci_upper': ro.FloatVector(ci_upper)}) dftext_label = 'lambda %~~% ' + str(round(l,3)) ro.globalenv['dftext'] = ro.DataFrame({'x': ro.r('Inf'), 'y': 0.5, 'lab': dftext_label}) if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.eps') else: ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.pdf') ro.r(""" gp<-ggplot(df) pp<-gp + aes_string(x='a',y='b') + geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + geom_point(size=2) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if np.max(results['logp']) > cfg['crop']: print " generating cropped standard qq plot" ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1') if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.pdf') ro.r(""" gp<-ggplot(df) pp<-gp + aes_string(x='a',y='b') + geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + geom_point(aes(shape=factor(shape)),size=2) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) def ppoints(n, a): try: n = np.float(len(n)) except TypeError: n = np.float(n) return (np.arange(n) + 1 - a)/(n + 1 - 2*a) if cfg['qq_strat_freq']: print " generating frequency stratified qq plot" strat_ticks = np.sort([np.float(x) for x in cfg['freq_ticks'].split(',')]) results['UGA___QQ_BIN___'] = 0 for i in xrange(len(strat_ticks)): results.loc[(results[cfg['freqcol']] >= strat_ticks[i]) & (results[cfg['freqcol']] <= 1-strat_ticks[i]),'UGA___QQ_BIN___'] = i+1 bin_values = results['UGA___QQ_BIN___'].value_counts() for i in xrange(len(strat_ticks)+1): if i not in bin_values.index: bin_values[i] = 0 counts = pd.DataFrame(bin_values) counts['lambda'] = np.nan results['description'] = 'NA' for i in xrange(len(strat_ticks)+1): if counts.loc[i,'UGA___QQ_BIN___'] > 0: counts.loc[i,'lambda'] = np.median(scipy.chi2.ppf([1-x for x in results[pcol][results['UGA___QQ_BIN___'] == i].tolist()], df=1))/scipy.chi2.ppf(0.5,1) else: counts.loc[i,'lambda'] = np.nan if i == 0: results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "(0," + str(strat_ticks[i]) + ") ~" + str(round(counts.loc[i,'lambda'],3)) print " MAF (0," + str(strat_ticks[i]) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda']) elif i < len(strat_ticks): results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(strat_ticks[i-1]) + "," + str(strat_ticks[i]) + ") ~" + str(round(counts.loc[i,'lambda'],3)) print " MAF [" + str(strat_ticks[i-1]) + "," + str(strat_ticks[i]) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda']) else: results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(strat_ticks[i-1]) + ",0.5] ~" + str(round(counts.loc[i,'lambda'],3)) print " MAF [" + str(strat_ticks[i-1]) + ",0.5]: n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda']) results.sort_values(['UGA___QQ_BIN___','logp'],inplace=True) results['expected'] = 0 for i in counts.index: if counts.loc[i,'UGA___QQ_BIN___'] > 0: results.loc[results['UGA___QQ_BIN___'] == i,'expected'] = np.sort(-1 * np.log10(ppoints(len(results.loc[results['UGA___QQ_BIN___'] == i,'expected']),0))) ro.globalenv['df'] = ro.DataFrame({'expected': ro.FloatVector(results['expected']), 'logp': ro.FloatVector(results['logp']), 'UGA___QQ_BIN___': ro.IntVector(results['UGA___QQ_BIN___']), 'description': ro.StrVector(results['description'])}) ro.r("df<-df[order(df$UGA___QQ_BIN___),]") ro.r("df$description<-ordered(df$description,levels=unique(df$description))") if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.pdf') ro.r(""" gp<-ggplot(df, aes_string(x='expected',y='logp')) + geom_point(aes_string(color='description'), size=2) + scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if np.max(results['logp']) > cfg['crop']: print " generating cropped frequency stratified qq plot" ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1') if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.pdf') ro.r(""" gp<-ggplot(df, aes_string(x='expected',y='logp')) + geom_point(aes(shape=factor(shape), color=description), size=2) + scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + guides(shape=FALSE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if cfg['qq_strat_mac']: print " generating minor allele count stratified qq plot" strat_ticks = np.sort([np.float(x) for x in cfg['mac_ticks'].split(',')]) results['UGA___QQ_BIN___'] = 0 for i in xrange(len(strat_ticks)): results.loc[results[cfg['maccol']] >= strat_ticks[i],'UGA___QQ_BIN___'] = i+1 bin_values = results['UGA___QQ_BIN___'].value_counts() for i in xrange(len(strat_ticks)+1): if i not in bin_values.index: bin_values[i] = 0 counts = pd.DataFrame(bin_values) counts['lambda'] = 0 results['description'] = 'NA' for i in np.sort(counts.index): if counts.loc[i,'UGA___QQ_BIN___'] > 0: counts.loc[i,'lambda'] = np.median(scipy.chi2.ppf([1-x for x in results[pcol][results['UGA___QQ_BIN___'] == i].tolist()], df=1))/scipy.chi2.ppf(0.5,1) else: counts.loc[i,'lambda'] = np.nan if i == 0: results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "(0," + str(int(strat_ticks[i])) + ") ~" + str(round(counts.loc[i,'lambda'],3)) print " MAC (0," + str(int(strat_ticks[i])) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda']) elif i < len(strat_ticks): results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(int(strat_ticks[i-1])) + "," + str(int(strat_ticks[i])) + ") ~" + str(round(counts.loc[i,'lambda'],3)) print " MAC [" + str(int(strat_ticks[i-1])) + "," + str(int(strat_ticks[i])) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda']) else: results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(int(strat_ticks[i-1])) + ",...] ~" + str(round(counts.loc[i,'lambda'],3)) print " MAC [" + str(int(strat_ticks[i-1])) + ",...]: n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda']) results.sort_values(['UGA___QQ_BIN___','logp'],inplace=True) results['expected'] = 0 for i in counts.index: results.loc[results['UGA___QQ_BIN___'] == i,'expected'] = np.sort(-1 * np.log10(ppoints(len(results.loc[results['UGA___QQ_BIN___'] == i,'expected']),0))) ro.globalenv['df'] = ro.DataFrame({'expected': ro.FloatVector(results['expected']), 'logp': ro.FloatVector(results['logp']), 'UGA___QQ_BIN___': ro.IntVector(results['UGA___QQ_BIN___']), 'description': ro.StrVector(results['description'])}) ro.r("df<-df[order(df$UGA___QQ_BIN___),]") ro.r("df$description<-ordered(df$description,levels=unique(df$description))") if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.pdf') ro.r(""" gp<-ggplot(df, aes_string(x='expected',y='logp')) + geom_point(aes_string(color='description'), size=2) + scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if np.max(results['logp']) > cfg['crop']: print " generating cropped frequency stratified qq plot" ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1') if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.pdf') ro.r(""" gp<-ggplot(df, aes_string(x='expected',y='logp')) + geom_point(aes(shape=factor(shape), color=description), size=2) + scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) + geom_abline(intercept=0, slope=1, alpha=0.5) + scale_x_discrete(expression(Expected~~-log[10](italic(p)))) + scale_y_discrete(expression(Observed~~-log[10](italic(p)))) + coord_fixed() + theme_bw(base_size = 12) + guides(shape=FALSE) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12)) %s """ % (ggsave)) if cfg['mht']: print " generating standard manhattan plot" print " minimum p-value: " + str(np.min(results[pcol])) print " maximum -1*log10(p-value): " + str(np.max(results['logp'])) if cfg['gc'] and l > 1: print " adjusting p-values for genomic inflation for p-value column " + pcol results[pcol]=2 * scipy.norm.cdf(-1 * np.abs(scipy.norm.ppf(0.5*results[pcol]) / math.sqrt(l))) print " minimum post-gc adjustment p-value: " + str(np.min(results[pcol])) print " maximum post-gc adjustment -1*log10(p-value): " + str(np.max(results['logp'])) else: print " skipping genomic inflation correction" print " calculating genomic positions" results.sort_values(by=[cfg['chrcol'],cfg['bpcol']], inplace=True) ticks = [] lastbase = 0 results['gpos'] = 0 nchr = len(list(np.unique(results[cfg['chrcol']].values))) chrs = np.unique(results[cfg['chrcol']].values) if cfg['color']: colours = ["#08306B","#41AB5D","#000000","#F16913","#3F007D","#EF3B2C","#08519C","#238B45","#252525","#D94801","#54278F","#CB181D","#2171B5","#006D2C","#525252","#A63603","#6A51A3","#A50F15","#4292C6","#00441B","#737373","#7F2704","#807DBA","#67000D"] else: colours = ["#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3"] if nchr == 1: results['gpos'] = results[cfg['bpcol']] results['colours'] = "#08589e" if results['gpos'].max() - results['gpos'].min() <= 1000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 100 == 0] elif results['gpos'].max() - results['gpos'].min() <= 10000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 1000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 100000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 10000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 200000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 20000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 300000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 30000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 400000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 40000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 500000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 50000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 600000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 60000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 700000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 70000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 800000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 80000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 900000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 90000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 1000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 100000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 10000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 1000000 == 0] elif results['gpos'].max() - results['gpos'].min() <= 100000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 10000000 == 0] elif results['gpos'].max() - results['gpos'].min() > 100000000: ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % 25000000 == 0] else: results['colours'] = "#000000" for i in range(len(chrs)): print " processed chromosome " + str(int(chrs[i])) if i == 0: results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']] else: lastbase = lastbase + results.loc[results[cfg['chrcol']] == chrs[i-1],cfg['bpcol']].iloc[-1] results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = (results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]) + lastbase if results.loc[results[cfg['chrcol']] == chrs[i]].shape[0] > 1: ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0] + (results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[-1] - results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])/2) else: ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0]) results.loc[results[cfg['chrcol']] == chrs[i],'colours'] = colours[int(chrs[i])] results['logp'] = -1 * np.log10(results[pcol]) if results.shape[0] >= 1000000: sig = 5.4e-8 else: sig = 0.05 / results.shape[0] print " significance level set to p-value = " + str(sig) + " (-1*log10(p-value) = " + str(-1 * np.log10(sig)) + ")" print " " + str(len(results[pcol][results[pcol] <= sig])) + " genome wide significant variants" chr = results[cfg['chrcol']][0] maxy=int(max(np.ceil(-1 * np.log10(sig)),np.ceil(results['logp'].max()))) if maxy > 20: y_breaks = range(0,maxy,5) y_labels = range(0,maxy,5) else: y_breaks = range(0,maxy) y_labels = range(0,maxy) ro.globalenv['df'] = ro.DataFrame({'gpos': ro.FloatVector(results['gpos']), 'logp': ro.FloatVector(results['logp']), 'colours': ro.FactorVector(results['colours'])}) ro.globalenv['ticks'] = ro.FloatVector(ticks) ro.globalenv['labels'] = ro.Vector(["{:,}".format(x/1000) for x in ticks]) ro.globalenv['colours'] = ro.StrVector(colours) ro.globalenv['chrs'] = ro.FloatVector(chrs) print " generating manhattan plot" if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.pdf') if nchr == 1: ro.r(""" gp<-ggplot(df, aes_string(x='gpos',y='logp')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(size=1.5) + scale_x_continuous(expression(Chromosome~~%d~~(kb))'),breaks=ticks,labels=labels) + \ scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + \ theme_bw(base_size = 8) + \ theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, chr, maxy, maxy, ggsave)) else: ro.r(""" gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(size=1.5) + scale_colour_manual(values=colours) + scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, maxy, maxy, ggsave)) if maxy > cfg['crop']: maxy = cfg['crop'] ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop'])) ro.r('df$shape<-0') ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1') print " generating cropped manhattan plot" if cfg['ext'] == 'tiff': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.tiff') elif cfg['ext'] == 'png': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.png') elif cfg['ext'] == 'eps': ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.cropped.eps') else: ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.cropped.pdf') if nchr == 1: ro.r(""" gp<-ggplot(df, aes_string(x='gpos',y='logp')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(aes(shape=factor(shape)),size=1.5) + scale_x_continuous(expression(Chromosome~~%d~~(kb))'),breaks=ticks,labels=labels) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, chr, maxy, maxy, ggsave)) else: ro.r(""" gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + geom_point(aes(shape=factor(shape)),size=1.5) + scale_colour_manual(values=colours) + scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + theme_bw(base_size = 8) + theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=8), axis.text = element_text(size=12), legend.position = 'none') %s """ % (sig, maxy, maxy, ggsave)) print "process complete" return 0