def genebody_percentile(refbed, mRNA_len_cut=100):
    '''
    Return percentile points of gene body for each transcript of a BED file.

    refbed: path of a 12-column BED gene model. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped.
    mRNA_len_cut: transcripts with fewer exonic bases than this are skipped.

    Returns a dict mapping "chrom_txStart_txEnd_name_strand" to a tuple
    (chrom, strand, points). points is the result of mystat.percentile_list
    applied to the 1-based exonic coordinates of the transcript: all of them
    when the transcript has exactly mRNA_len_cut bases, otherwise the
    coordinates left after dropping the first mRNA_len_cut ones.

    Terminates the interpreter (exit status 0) when refbed is None.
    '''
    if refbed is None:
        sys.stderr.write("You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    g_percentiles = {}
    transcript_count = 0
    with open(refbed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                tx_end = int(fields[2])
                geneName = fields[3]
                strand = fields[5]
                block_offsets = [int(v) for v in fields[11].rstrip(',\n').split(',')]
                block_sizes = [int(v) for v in fields[10].rstrip(',\n').split(',')]
                if len(block_offsets) != len(block_sizes):
                    raise ValueError("blockSizes and blockStarts differ in length")
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " + line)
                continue
            transcript_count += 1

            geneID = '_'.join(
                [str(j) for j in (chrom, tx_start, tx_end, geneName, strand)])
            exon_starts = [tx_start + offset for offset in block_offsets]
            exon_ends = [st + size for st, size in zip(exon_starts, block_sizes)]

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))  # 1-based coordinates on genome

            if len(gene_all_base) < mRNA_len_cut:
                continue
            # NOTE(review): discarding the first mRNA_len_cut bases of longer
            # transcripts is the behaviour this function already had; confirm
            # that it is intended.
            if len(gene_all_base) == mRNA_len_cut:
                selected = gene_all_base
            else:
                selected = gene_all_base[mRNA_len_cut:]
            g_percentiles[geneID] = (chrom, strand, mystat.percentile_list(selected))

    printlog("Total " + str(transcript_count) + ' transcripts loaded')
    return g_percentiles
def genebody_percentile(anno, gene_filter, mRNA_len_cut=100):
    '''
    Return percentile points of gene body for genes of an annotation table.

    anno: path of a whitespace-separated table whose columns are read as
        gene name, chromosome, start, end, strand ("1" means '+', anything
        else '-'). Lines starting with 'Ensembl' are treated as headers.
        Only chromosomes "MT", "X", "Y" and purely numeric names are kept;
        they are renamed to "chrM", "chrX", "chrY" and "chr<N>".
    gene_filter: optional path of a tab-separated file; when it yields at
        least one entry, only genes whose name appears in its first column
        are kept. A false value (None, '') disables filtering.
    mRNA_len_cut: genes spanning fewer bases than this are skipped.

    Returns a dict mapping "chrom_start_end_name_strand" to a tuple
    (chrom, strand, points), where points is the result of
    mystat.percentile_list applied to the 1-based coordinates of the gene.
    '''
    # A set gives constant-time membership tests while scanning the table.
    wanted = set()
    if gene_filter:
        with open(gene_filter) as f:
            for line in f:
                wanted.add(line.rstrip().split("\t")[0])

    g_percentiles = {}
    with open(anno, 'r') as table:
        for line in table:
            if line.startswith('Ensembl'):
                continue
            # Parse fields from gene table
            fields = line.split()
            if not fields:
                continue  # blank line

            raw_chrom = fields[1]
            if raw_chrom == "MT":
                chrom = "chrM"
            elif raw_chrom in ("X", "Y") or raw_chrom.isdigit():
                chrom = "chr" + raw_chrom
            else:
                continue

            tx_start = int(fields[2])
            tx_end = int(fields[3])
            geneName = fields[0]
            strand = "+" if fields[4] == "1" else "-"

            if wanted and geneName not in wanted:
                continue

            gene_all_base = list(range(tx_start + 1, tx_end + 1))  # 1-based coordinates on genome
            if len(gene_all_base) < mRNA_len_cut:
                continue

            geneID = '_'.join(
                [str(j) for j in (chrom, tx_start, tx_end, geneName, strand)])
            g_percentiles[geneID] = (
                chrom, strand, mystat.percentile_list(gene_all_base))
    return g_percentiles
def Rcode_write(dataset, file_prefix, format='pdf', length=100):
    '''
    Generate an R script for visualization.

    dataset: iterable of 3-item entries (name, data, extra). name becomes the
        R variable holding the values of data; the third item is ignored.
    file_prefix: the script is written to file_prefix + '.r' and the plot it
        produces is named file_prefix + '.curves.<format>'.
    format: name of the R graphics device (lower-cased), also used as the
        extension of the plot file.
    length: the x vector of the plot is mystat.percentile_list applied to
        1..length.

    With one entry a single curve is drawn; with 2 to 6 entries the curves
    share one panel with an in-plot legend; with more than 6 entries the
    legend is drawn in a separate panel. Nothing is plotted for an empty
    dataset (the device is still opened and closed).
    '''
    plot_cmd = ("plot(x,%s,type='l',xlab=\"Gene body, bp (5'->3')\", "
                "ylab=\"Starts coverage\",lwd=0.8,col=icolor[1])")

    # The context manager closes the script even when a step below raises.
    with open(file_prefix + '.r', 'w') as ROUT:
        names = []
        for name, data, _unused in dataset:
            names.append(name)
            ROUT.write(name + ' <- c(' + ','.join([str(i) for i in data]) + ')\n')

        x = mystat.percentile_list(list(range(1, length + 1)))
        n_curves = len(names)

        script = [
            '\n',
            '%s("%s.%s")' % (format.lower(), file_prefix + ".curves", format.lower()),
            "x %s" % (' <- c(' + ','.join([str(i) for i in x]) + ')'),
            'icolor = colorRampPalette(c("#7fc97f","#beaed4","#fdc086","#ffff99","#386cb0","#f0027f"))(%d)' % n_curves,
        ]

        if n_curves >= 1:
            if n_curves > 6:
                # Reserve a side panel for the legend.
                script.append('layout(matrix(c(1,1,1,2,1,1,1,2,1,1,1,2), 4, 4, byrow = TRUE))')
            script.append(plot_cmd % names[0])
            for idx in range(1, n_curves):
                script.append("lines(x,%s,type='l',col=icolor[%d])" % (names[idx], idx + 1))

            quoted_names = ','.join(["'" + str(n) + "'" for n in names])
            if 2 <= n_curves <= 6:
                script.append("legend(0,1,fill=icolor[%d:%d], legend=c(%s))" % (
                    1, n_curves, quoted_names))
            elif n_curves > 6:
                script.append('par(mar=c(1,0,2,1))')
                script.append('plot.new()')
                script.append("legend(0,1,fill=icolor[%d:%d],legend=c(%s))" % (
                    1, n_curves, quoted_names))

        script.append('dev.off()')
        ROUT.write('\n'.join(script) + '\n')
def genebody_percentile(refbed, mRNA_len_cut=100):
    '''
    Return percentile points of gene body for each transcript of a BED file.

    refbed: path of a 12-column BED gene model. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped.
    mRNA_len_cut: transcripts with fewer exonic bases than this are skipped.

    Returns a dict mapping "chrom_txStart_txEnd_name_strand" to a tuple
    (chrom, strand, points), where points is the result of
    mystat.percentile_list applied to the 1-based exonic coordinates of the
    transcript.

    Terminates the interpreter (exit status 0) when refbed is None.
    '''
    if refbed is None:
        sys.stderr.write("You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    g_percentiles = {}
    transcript_count = 0
    with open(refbed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                tx_end = int(fields[2])
                geneName = fields[3]
                strand = fields[5]
                block_offsets = [int(v) for v in fields[11].rstrip(',\n').split(',')]
                block_sizes = [int(v) for v in fields[10].rstrip(',\n').split(',')]
                if len(block_offsets) != len(block_sizes):
                    raise ValueError("blockSizes and blockStarts differ in length")
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " + line)
                continue
            transcript_count += 1

            # Exon boundaries: block offsets are relative to the transcript start.
            exon_starts = [tx_start + offset for offset in block_offsets]
            exon_ends = [st + size for st, size in zip(exon_starts, block_sizes)]

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))  # 1-based coordinates on genome
            if len(gene_all_base) < mRNA_len_cut:
                continue

            geneID = '_'.join(
                [str(j) for j in (chrom, tx_start, tx_end, geneName, strand)])
            g_percentiles[geneID] = (
                chrom, strand, mystat.percentile_list(gene_all_base))

    printlog("Total " + str(transcript_count) + ' transcripts loaded')
    return g_percentiles
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype="png"):
    '''
    Calculate reads coverage over gene body, from 5' to 3'.

    bigFile: path of a bigwig file holding the signal.
    refbed: path of a 12-column BED gene model. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped.
    outfile: prefix of the two files written:
        <outfile>.geneBodyCoverage.txt     percentile index / summed signal
        <outfile>.geneBodyCoverage_plot.r  R script plotting the profile
    gtype: R graphics device, also used as extension of the plot file.

    For every transcript with at least 100 exonic bases the coordinates
    returned by mystat.percentile_list are looked up in the bigwig file
    (reversed for minus-strand transcripts) and the signal is summed per
    percentile index. The R script divides the sums by the number of
    successfully parsed transcripts.

    Terminates the interpreter (exit status 0) when refbed is None.
    '''
    if refbed is None:
        sys.stderr.write("You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    coverage = collections.defaultdict(int)
    gene_count = 0

    # Outputs are opened up front so that an unwritable destination fails
    # before the scan starts; every handle is closed on exit.
    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as OUT1, \
            open(outfile + ".geneBodyCoverage.txt", 'w') as OUT2, \
            open(bigFile, 'rb') as bw_handle, \
            open(refbed, 'r') as bed:
        bw = BigWigFile(file=bw_handle)
        sys.stderr.write("calculating coverage over gene body ...\n")

        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                strand = fields[5]
                block_offsets = [int(v) for v in fields[11].rstrip(',\n').split(',')]
                block_sizes = [int(v) for v in fields[10].rstrip(',\n').split(',')]
                if len(block_offsets) != len(block_sizes):
                    raise ValueError("blockSizes and blockStarts differ in length")
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " + line)
                continue
            # Only transcripts that parsed cleanly enter the averaging denominator.
            gene_count += 1

            exon_starts = [tx_start + offset for offset in block_offsets]
            exon_ends = [st + size for st, size in zip(exon_starts, block_sizes)]
            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))

            # The cut-off applies to the total exonic length, so it is tested
            # once all exons are collected rather than after each exon.
            if len(gene_all_base) < 100:
                continue

            # Walk minus-strand transcripts from their 5' end as well.
            gene_all_base.sort(reverse=(strand == '-'))

            percentile_base = mystat.percentile_list(gene_all_base)
            for i, pos in enumerate(percentile_base):
                sig = bw.get_as_array(chrom, pos - 1, pos)
                if sig is None:
                    continue
                coverage[i] += np.nan_to_num(sig[0])
            sys.stderr.write(" %d genes finished\r" % gene_count)

        # Sorted keys keep the table and the R vector in percentile order.
        y_coord = []
        OUT2.write("percentile\tcount\n")
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            OUT2.write(str(i) + '\t' + str(coverage[i]) + '\n')

        OUT1.write("%s('%s')\n" % (gtype, outfile + ".geneBodyCoverage." + gtype))
        OUT1.write("x=0:100\n")
        OUT1.write("y=c(" + ','.join(y_coord) + ')\n')
        OUT1.write(
            "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')\n"
            % gene_count)
        OUT1.write("dev.off()\n")
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype='png'):
    '''
    Calculate reads coverage over gene body, from 5' to 3'.

    bigFile: path of a bigwig file holding the signal.
    refbed: path of a 12-column BED gene model. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped. Transcripts on chromosomes absent
        from the bigwig file are skipped silently.
    outfile: prefix of the two files written:
        <outfile>.geneBodyCoverage.txt     percentile index / summed signal
        <outfile>.geneBodyCoverage_plot.r  R script plotting the profile
    gtype: R graphics device, also used as extension of the plot file.

    For every transcript with at least 100 exonic bases the coordinates
    returned by mystat.percentile_list are looked up in the bigwig file
    (reversed for minus-strand transcripts) and the signal is summed per
    percentile index. The R script divides the sums by the number of
    transcripts that were read successfully.

    Terminates the interpreter (exit status 0) when refbed is None.
    '''
    if refbed is None:
        print("You must specify a bed file representing gene model",
              file=sys.stderr)
        sys.exit(0)

    bw = openBigWig(bigFile, 'r')
    coverage = defaultdict(int)
    gene_count = 0
    try:
        # Chromosomes present in the bigwig file
        chroms = set(bw.chroms())
        print("calculating coverage over gene body ...", file=sys.stderr)

        with open(refbed, 'r') as handle:
            # Loop through the genes in the BED file
            for line in handle:
                if line.startswith(('#', 'track', 'browser')):
                    continue
                try:
                    # Parse fields from gene table
                    fields = line.split()
                    chrom = fields[0]
                    tx_start = int(fields[1])
                    strand = fields[5]
                    # Chromosomes of the BED file may be missing from the
                    # bigwig file (e.g. patches or unplaced scaffolds).
                    if chrom not in chroms:
                        continue
                    block_offsets = [
                        int(v) for v in fields[11].rstrip(',\n').split(',')]
                    block_sizes = [
                        int(v) for v in fields[10].rstrip(',\n').split(',')]
                except (IndexError, ValueError):
                    print(
                        "[NOTE: input bed must be 12-column] skipped this line: "
                        + line,
                        end='\n',
                        file=sys.stderr)
                    continue

                # Count gene if it was properly read
                gene_count += 1

                exon_starts = [tx_start + offset for offset in block_offsets]
                exon_ends = [
                    st + size for st, size in zip(exon_starts, block_sizes)]
                gene_all_base = []
                for st, end in zip(exon_starts, exon_ends):
                    gene_all_base.extend(range(st + 1, end + 1))

                # The cut-off applies to the total exonic length, so it is
                # tested once all exons are collected rather than per exon.
                if len(gene_all_base) < 100:
                    continue

                # Sort coordinates according to the strand
                gene_all_base.sort(reverse=(strand == '-'))

                percentile_base = mystat.percentile_list(gene_all_base)
                for i, pos in enumerate(percentile_base):
                    sig = bw.values(chrom, pos - 1, pos)
                    coverage[i] += nan_to_num(sig[0])
                print(" \t%d genes finished\r" % gene_count, end=' ',
                      file=sys.stderr)
    finally:
        # Close bigwig file, also when the scan fails
        bw.close()
    print("\n", file=sys.stderr)

    # Sorted keys keep the table and the R vector in percentile order.
    y_coord = []
    with open(outfile + ".geneBodyCoverage.txt", 'w') as handle:
        handle.write("percentile\tcount\n")
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            # NOTE(review): "%i" truncates the summed signal to an integer in
            # this table while the R script below keeps full precision --
            # confirm that the truncation is wanted.
            handle.write("%i\t%i\n" % (i, coverage[i]))

    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as handle:
        handle.write("%s(\'%s\')\n" %
                     (gtype, outfile + ".geneBodyCoverage." + gtype))
        handle.write("x=1:100\n")
        handle.write("y=c(%s)\n" % ','.join(y_coord))
        handle.write(
            "plot(x, y/%s, xlab=\"percentile of gene body (5'->3')\", ylab='average wigsum', type='s')\n"
            % gene_count)
        handle.write("dev.off()\n")
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype="png"):
    '''
    Calculate reads coverage over gene body, from 5' to 3'.

    bigFile: path of a bigwig file holding the signal.
    refbed: path of a 12-column BED gene model. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped.
    outfile: prefix of the two files written:
        <outfile>.geneBodyCoverage.txt     percentile index / summed signal
        <outfile>.geneBodyCoverage_plot.r  R script plotting the profile
    gtype: R graphics device, also used as extension of the plot file.

    For every transcript with at least 100 exonic bases the coordinates
    returned by mystat.percentile_list are looked up in the bigwig file
    (reversed for minus-strand transcripts) and the signal is summed per
    percentile index. The R script divides the sums by the number of
    successfully parsed transcripts.

    Terminates the interpreter (exit status 0) when refbed is None.
    '''
    if refbed is None:
        sys.stderr.write("You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    coverage = collections.defaultdict(int)
    gene_count = 0

    # Outputs are opened up front so that an unwritable destination fails
    # before the scan starts; every handle is closed on exit.
    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as OUT1, \
            open(outfile + ".geneBodyCoverage.txt", 'w') as OUT2, \
            open(bigFile, 'rb') as bw_handle, \
            open(refbed, 'r') as bed:
        bw = BigWigFile(file=bw_handle)
        sys.stderr.write("calculating coverage over gene body ...\n")

        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                strand = fields[5]
                block_offsets = [int(v) for v in fields[11].rstrip(',\n').split(',')]
                block_sizes = [int(v) for v in fields[10].rstrip(',\n').split(',')]
                if len(block_offsets) != len(block_sizes):
                    raise ValueError("blockSizes and blockStarts differ in length")
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " + line)
                continue
            # Only transcripts that parsed cleanly enter the averaging denominator.
            gene_count += 1

            exon_starts = [tx_start + offset for offset in block_offsets]
            exon_ends = [st + size for st, size in zip(exon_starts, block_sizes)]
            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))

            # The cut-off applies to the total exonic length, so it is tested
            # once all exons are collected rather than after each exon.
            if len(gene_all_base) < 100:
                continue

            # Walk minus-strand transcripts from their 5' end as well.
            gene_all_base.sort(reverse=(strand == '-'))

            percentile_base = mystat.percentile_list(gene_all_base)
            for i, pos in enumerate(percentile_base):
                sig = bw.get_as_array(chrom, pos - 1, pos)
                if sig is None:
                    continue
                coverage[i] += np.nan_to_num(sig[0])
            sys.stderr.write(" %d genes finished\r" % gene_count)

        # Sorted keys keep the table and the R vector in percentile order.
        y_coord = []
        OUT2.write("percentile\tcount\n")
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            OUT2.write(str(i) + '\t' + str(coverage[i]) + '\n')

        OUT1.write("%s('%s')\n" % (gtype, outfile + ".geneBodyCoverage." + gtype))
        OUT1.write("x=0:100\n")
        OUT1.write("y=c(" + ','.join(y_coord) + ')\n')
        OUT1.write(
            "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')\n"
            % gene_count)
        OUT1.write("dev.off()\n")