def main():
    """Command-line entry point: profile average beta values along gene regions.

    Parses the options, reads the CpG BED file with ``read_CpG_bed`` and the
    gene model with ``BED.ParseBED``, then processes eight region groups in
    5'->3' order (upstream, 5' UTR, coding exons, first/internal/last introns,
    3' UTR, downstream).  For each group the mapping returned by
    ``coverage_over_range`` is written, sorted by key, to ``<prefix>.txt`` and
    as an R vector to ``<prefix>.r``.  The R script plots all groups on one
    axis into ``<prefix>.pdf`` and is finally executed with ``Rscript`` on a
    best-effort basis.

    Exits with status 101, 102 or 103 when the input file, the gene model or
    the output prefix (respectively) is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input-file", action="store", type="string", dest="input_file",
        help=("BED6+ file specifying the C position. This BED file should have "
              "at least 6 columns (Chrom, ChromStart, ChromeEnd, Name, "
              "Beta_value, Strand). BED6+ file can be a regular text file or "
              "compressed file (*.gz, *.bz2) or accessible url."))
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="gene_file",
        help=("Reference gene model in standard BED12 format "
              "(https://genome.ucsc.edu/FAQ/FAQformat.html#format1). "
              "\"Strand\" column must exist in order to decide 5' and 3' UTRs, "
              "up- and down-stream intergenic regions."))
    parser.add_option(
        "-d", "--downstream", action="store", type="int",
        dest="downstream_size", default=2000,
        help="Size of down-stream genomic region added to gene. default=%default (bp)")
    parser.add_option(
        "-u", "--upstream", action="store", type="int",
        dest="upstream_size", default=2000,
        help="Size of up-stream genomic region added to gene. default=%default (bp)")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="Prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # Every mandatory option has its own exit status.
    for value, exit_code in (
        (options.input_file, 101),
        (options.gene_file, 102),
        (options.out_file, 103),
    ):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    r_script = options.out_file + '.r'
    group_sizes = []  # number of datapoints in each group

    # "with" guarantees both files are flushed and closed before Rscript runs,
    # and also when one of the processing steps raises.
    with open(options.out_file + '.txt', 'w') as FOUT, open(r_script, 'w') as ROUT:
        print("\t".join(["Group", "Relative_position(5'->3')", "Average_beta"]), file=FOUT)

        # step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        # step2: read gene file
        printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
        ref_gene = BED.ParseBED(options.gene_file)

        # (group label, log message, region extractor), in plotting order.
        # The extractors are lambdas so that each region set is only built
        # right before it is used, exactly as in a hand-written sequence.
        groups = (
            ("Upstream_intergenic", "Process upstream regions ...",
             lambda: ref_gene.getIntergenic(direction='up', size=options.upstream_size)),
            ("Five_prime_UTR", "Process 5' UTR exons ...",
             lambda: ref_gene.getUTRs(utr=5)),
            ("Coding_exon", "Process Coding exons ...",
             lambda: ref_gene.getCDSExons()),
            ("First_intron", "Process first introns ...",
             lambda: ref_gene.getIntrons(itype='first')),
            ("Internal_intron", "Process internal introns ...",
             lambda: ref_gene.getIntrons(itype='internal')),
            ("Last_intron", "Process last introns ...",
             lambda: ref_gene.getIntrons(itype='last')),
            ("Three_prime_UTR", "Process 3' UTR exons ...",
             lambda: ref_gene.getUTRs(utr=3)),
            ("Downstream_intergenic", "Process downstream regions ...",
             lambda: ref_gene.getIntergenic(direction='down', size=options.downstream_size)),
        )

        for label, message, get_regions in groups:
            printlog(message)
            s = coverage_over_range(get_regions(), cpg_ranges)
            group_sizes.append(len(s))
            positions = sorted(s)
            for i in positions:
                print('\t'.join([label, str(i), str(s[i])]), file=FOUT)
            print('%s_y=c(%s)' % (label, ','.join([str(s[i]) for i in positions])), file=ROUT)

        print('\n')

        # Cumulative number of datapoints after groups 1..7: the x positions
        # of the separators between neighbouring groups.
        boundaries = [sum(group_sizes[:k]) for k in range(1, 8)]

        print('pdf(file=\"%s\", width=10, height=5)' % (options.out_file + '.pdf'), file=ROUT)
        print(
            'plot(1:%d, c(Upstream_intergenic_y, Five_prime_UTR_y, Coding_exon_y, '
            'First_intron_y, Internal_intron_y, Last_intron_y, Three_prime_UTR_y, '
            'Downstream_intergenic_y),ylim=c(0,1), xaxt="n",xlab="", '
            'ylab="Average methylation", type="l", col="red")' % sum(group_sizes),
            file=ROUT)
        # NOTE(review): these fixed separators are emitted in addition to the
        # computed ones below; kept to preserve the existing plot - confirm
        # whether they are still wanted.
        print('abline(v = c(100,201,302,403,504,605,706),col="blue", lty="dashed")', file=ROUT)
        print('abline(v = c(%d,%d,%d,%d,%d,%d,%d),col="blue", lty="dashed")' % tuple(boundaries),
              file=ROUT)
        print('abline(h = 0.5,col="grey", lty="dashed")', file=ROUT)
        # Every label uses the two-character escape "\\n" so that the R string
        # literal stays on one line of the generated script.
        print(
            'text(x=c(%d,%d,%d,%d,%d,%d,%d, %d)+50, y=0.9, cex=0.7, labels=c('
            '"Upstream\\n(5\'->3\')", "5\'UTR exon\\n(5\'->3\')",'
            '"Coding exon\\n(5\'->3\')","First intron\\n(5\'->3\')",'
            '"Internal intron\\n(5\'->3\')","Last intron\\n(5\'->3\')", '
            '"3\'UTR exon\\n(5\'->3\')","Downstream\\n(5\'->3\')"))'
            % tuple([0] + boundaries),
            file=ROUT)
        print('dev.off()', file=ROUT)

    # Best effort: a missing or failing Rscript must not abort the program.
    # An argument list (no shell) keeps prefixes with spaces or shell
    # metacharacters intact.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = None
    if status != 0:
        print("Cannot generate pdf file from " + r_script, file=sys.stderr)
def main():
    """Command-line entry point: CpG density over prioritised gene-model regions.

    Parses the options, reads the CpG BED file with ``read_CpG_bed`` and the
    gene model with ``BED.ParseBED``, then derives five region categories in
    priority order (coding exons, UTR exons, introns, upstream, downstream).
    Each category is merged with ``BED.unionBed3`` and every higher-priority
    category is removed from it with ``BED.subtractBed3`` before
    ``count_over_range`` is called.  One row per category is written to
    ``<prefix>.tsv``; ``<prefix>.r`` receives a bar plot script that is finally
    executed with ``Rscript`` on a best-effort basis.

    Exits with status 101, 102 or 103 when the input file, the gene model or
    the output prefix (respectively) is missing.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--input_file", action="store", type="string", dest="input_file",
        help=("BED file specifying the C position. This BED file should have "
              "at least three columns (Chrom, ChromStart, ChromeEnd). Note: "
              "the first base in a chromosome is numbered 0. This file can be "
              "a regular text file or compressed file (.gz, .bz2)."))
    parser.add_option(
        "-r", "--refgene", action="store", type="string", dest="gene_file",
        help=("Reference gene model in standard BED-12 format "
              "(https://genome.ucsc.edu/FAQ/FAQformat.html#format1). "))
    parser.add_option(
        "-d", "--downstream", action="store", type="int",
        dest="downstream_size", default=2000,
        help=("Size of down-stream intergenic region w.r.t. TES "
              "(transcription end site). default=%default (bp)"))
    parser.add_option(
        "-u", "--upstream", action="store", type="int",
        dest="upstream_size", default=2000,
        help=("Size of up-stream intergenic region w.r.t. TSS "
              "(transcription start site). default=%default (bp)"))
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # Every mandatory option has its own exit status.
    for value, exit_code in (
        (options.input_file, 101),
        (options.gene_file, 102),
        (options.out_file, 103),
    ):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    r_script = options.out_file + '.r'
    header = ("Priority_order", "Name", "Number_of_regions",
              "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")
    rows = []  # one [order, name, n_regions, size, count, density] per category

    # "with" guarantees both files are closed before Rscript runs, and also
    # when one of the processing steps raises.
    with open(options.out_file + '.tsv', 'w') as FOUT, open(r_script, 'w') as ROUT:
        # step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.input_file))
        cpg_ranges = read_CpG_bed(options.input_file)

        # step2: read gene file
        printlog("Reading reference gene model: \"%s\"" % (options.gene_file))
        ref_gene = BED.ParseBED(options.gene_file)

        def subtract_all(regions, higher_priority):
            """Remove every higher-priority region set from *regions*."""
            for other in higher_priority:
                regions = BED.subtractBed3(regions, other)
            return regions

        def record(order, name, regions):
            """Count CpGs in *regions* and append the summary row."""
            (size, count) = count_over_range(regions, cpg_ranges)
            # A category can end up with nothing left (size 0) once the
            # higher-priority regions are subtracted; report a density of 0
            # instead of failing with ZeroDivisionError.
            density = count * 1000.0 / size if size else 0.0
            rows.append([order, name, len(regions), size, count, density])

        # priority order: #1
        printlog("Extract Coding exons ...")
        cds_exons = ref_gene.getCDSExons(stranded=False)
        printlog("Merge Coding exons ...")
        cds_exons = BED.unionBed3(cds_exons)
        printlog("Count CpGs in Coding exons ...")
        record('0', 'Coding exons', cds_exons)

        # priority order: #2
        printlog("Extract UTR exons ...")
        utr_exons = ref_gene.getUTRs(utr=35, uniquify=True, stranded=False)
        printlog("Merge UTR exons ...")
        utr_exons = BED.unionBed3(utr_exons)
        printlog("Subtract regions with higher priority from UTR exons ...")
        utr_exons = subtract_all(utr_exons, [cds_exons])
        printlog("Count CpGs in UTR exons ...")
        record('1', 'UTR exons', utr_exons)

        # priority order: #3
        printlog("Extract introns ...")
        introns = ref_gene.getIntrons(itype='all', uniquify=True, stranded=False)
        printlog("Merge introns ...")
        introns = BED.unionBed3(introns)
        printlog("Subtract regions with higher priority from introns ...")
        introns = subtract_all(introns, [cds_exons, utr_exons])
        printlog("Count CpGs in introns ...")
        record('2', 'Introns', introns)

        # priority order: #4
        printlog("Extract upstream intergenic regions ...")
        upstream = ref_gene.getIntergenic(
            direction='up', size=options.upstream_size, uniquify=True, stranded=False)
        printlog("Merge upstream intergenic regions ...")
        upstream = BED.unionBed3(upstream)
        printlog("Subtract regions with higher priority from upstream intergenic regions...")
        upstream = subtract_all(upstream, [cds_exons, utr_exons, introns])
        printlog("Count CpGs in upstream regions ...")
        record('3', 'Upstream of TSS', upstream)

        # priority order: #5
        printlog("Extract downstream intergenic regions ...")
        downstream = ref_gene.getIntergenic(
            direction='down', size=options.downstream_size, uniquify=True, stranded=False)
        printlog("Merge downstream intergenic regions ...")
        downstream = BED.unionBed3(downstream)
        printlog("Subtract regions with higher priority from downstream intergenic regions...")
        downstream = subtract_all(downstream, [cds_exons, utr_exons, introns, upstream])
        printlog("Count CpGs in downstream regions ...")
        record('4', 'Downstream of TES', downstream)

        print('\n')

        # Tab-separated summary table: header line followed by one line per category.
        for fields in [header] + rows:
            print('\t'.join([str(field) for field in fields]), file=FOUT)

        names = [row[0] for row in rows]    # priority orders "0".."4"
        labels = [row[1] for row in rows]   # category names
        density = [row[5] for row in rows]  # CpGs per kb

        print("name = c(%s)" % ','.join(['"' + i + '"' for i in names]), file=ROUT)
        print("values = c(%s)" % ','.join([str(i) for i in density]), file=ROUT)
        print('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print('barplot(values,names.arg=name,col=c(%s),ylab="CpG per Kb")' % ','.join(colors(5)),
              file=ROUT)
        # Empty second panel that only hosts the "order = name" legend lines.
        print("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')",
              file=ROUT)
        for name, label in zip(names, labels):
            x_pos = 0.0
            y_pos = 1 - (int(name) * 20.0 + 5) / 100  # one legend line per 0.2 units, top down
            print("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos, name, label),
                  file=ROUT)
        print('dev.off()', file=ROUT)

    printlog("Running R script ...")
    # Best effort: a missing or failing Rscript must not abort the program.
    # An argument list (no shell) keeps prefixes with spaces or shell
    # metacharacters intact.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = None
    if status != 0:
        print("Cannot generate pdf file from " + r_script, file=sys.stderr)
def main():
    """Command-line entry point: CpG density over a prioritised list of BED files.

    Parses the options, reads the CpG BED file with ``read_CpG_bed`` and then
    handles the comma-separated BED files in the order given.  Each file is
    read with ``read_bed_as_list``, merged with ``BED.unionBed3`` and has the
    region sets of all earlier files removed with ``BED.subtractBed3`` before
    ``count_over_range`` is called.  One row per file is written to
    ``<prefix>.txt``; ``<prefix>.r`` receives a bar plot script that is finally
    executed with ``Rscript`` on a best-effort basis.

    Exits with status 101 when the CpG file or the BED list is missing, 102
    when the output prefix is missing and 103 when a listed BED file does not
    exist.
    """
    usage = "%prog [options]" + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i", "--cpg", action="store", type="string", dest="cpg_file",
        help=("BED file specifying the C position. This BED file should have "
              "at least three columns (Chrom, ChromStart, ChromeEnd). Note: "
              "the first base in a chromosome is numbered 0. This file can be "
              "a regular text file or compressed file (.gz, .bz2)."))
    parser.add_option(
        "-b", "--bed", action="store", type="string", dest="bed_files",
        help="List of BED files specifying the genomic regions.")
    parser.add_option(
        "-o", "--output", action="store", type='string', dest="out_file",
        help="The prefix of the output file.")
    (options, args) = parser.parse_args()

    print()
    # Exit statuses are kept as they were: both input options share 101.
    for value, exit_code in (
        (options.cpg_file, 101),
        (options.bed_files, 101),
        (options.out_file, 102),
    ):
        if not value:
            print(__doc__)
            parser.print_help()
            sys.exit(exit_code)

    r_script = options.out_file + '.r'
    header = ("Priority_order", "Name", "Number_of_regions",
              "Size_of_regions(bp)", "CpG_raw_count", "CpG_count_per_KB")
    rows = []  # one [order, name, n_regions, size, count, density] per BED file

    # "with" guarantees both files are closed before Rscript runs, and also
    # on the early sys.exit below or when a processing step raises.
    with open(options.out_file + '.txt', 'w') as FOUT, open(r_script, 'w') as ROUT:
        # step1: read CpG file
        printlog("Reading CpG file: \"%s\"" % (options.cpg_file))
        cpg_ranges = read_CpG_bed(options.cpg_file)

        # step2: check BED files
        printlog("Checking BED files: \"%s\"" % (options.bed_files))
        input_bed_files = options.bed_files.replace(' ', '').split(',')
        for bed_path in input_bed_files:
            if os.path.exists(bed_path):
                print("\t%s" % bed_path, file=sys.stderr)
            else:
                print("\"%s\" does not exist!" % bed_path, file=sys.stderr)
                sys.exit(103)

        # step3: read, merge, and subtract BED files.  The first file needs no
        # special case: its list of earlier region sets is simply empty.
        processed = []  # final region set of every file handled so far
        for i, bed_path in enumerate(input_bed_files):
            printlog("Reading BED file: \"%s\"" % bed_path)
            file_name = os.path.basename(bed_path)
            regions = read_bed_as_list(bed_path)
            printlog("Merging overlap entries in BED file: \"%s\"" % bed_path)
            regions = BED.unionBed3(regions)
            for j in range(i):
                printlog("Subtract \"%s\" from \"%s\"" % (input_bed_files[j], bed_path))
                regions = BED.subtractBed3(regions, processed[j])
            processed.append(regions)

            printlog("Counting CpGs ...")
            (size, count) = count_over_range(regions, cpg_ranges)
            # A file can end up with nothing left (size 0), e.g. when it is
            # empty or fully covered by earlier files; report a density of 0
            # instead of failing with ZeroDivisionError.
            density_per_kb = count * 1000.0 / size if size else 0.0
            rows.append([str(i), file_name, len(regions), size, count, density_per_kb])

        print('\n')

        # Tab-separated summary table: header line followed by one line per file.
        for fields in [header] + rows:
            print('\t'.join([str(field) for field in fields]), file=FOUT)

        names = [row[0] for row in rows]    # priority orders "0", "1", ...
        labels = [row[1] for row in rows]   # BED file base names
        density = [row[5] for row in rows]  # CpGs per kb

        print("name = c(%s)" % ','.join(['"' + i + '"' for i in names]), file=ROUT)
        print("values = c(%s)" % ','.join([str(i) for i in density]), file=ROUT)
        print('pdf("%s", width=8, height=6)' % (options.out_file + '.pdf'), file=ROUT)
        print('layout(matrix(c(1,1,2,1,1,2), nrow=2, byrow=TRUE))', file=ROUT)
        print('barplot(values,names.arg=name,col=c(%s),ylab="CpG per Kb")'
              % ','.join(colors(len(input_bed_files))), file=ROUT)
        # Empty second panel that only hosts the "order = name" legend lines.
        print("plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')",
              file=ROUT)
        for name, label in zip(names, labels):
            x_pos = 0.0
            y_pos = 1 - (int(name) * 20.0 + 5) / 200  # one legend line per 0.1 units, top down
            print("text(x=%f, y=%f, labels=c(\"%s = %s\"),adj=c(0,0))" % (x_pos, y_pos, name, label),
                  file=ROUT)
        print('dev.off()', file=ROUT)

    printlog("Running R script ...")
    # Best effort: a missing or failing Rscript must not abort the program.
    # An argument list (no shell) keeps prefixes with spaces or shell
    # metacharacters intact.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = None
    if status != 0:
        print("Cannot generate pdf file from " + r_script, file=sys.stderr)