コード例 #1
0
def genebody_percentile(refbed, mRNA_len_cut=100):
    '''
    Return percentile points of the gene body for each transcript in a BED file.

    refbed: path to a gene-model BED file. Every data line must have at
        least 12 whitespace-separated columns; column 11 is read as the
        comma-separated exon sizes and column 12 as the comma-separated exon
        starts relative to the transcript start. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped.
    mRNA_len_cut: transcripts whose summed exon length is smaller than this
        value are skipped.

    Returns a dict mapping "chrom_start_end_name_strand" to a tuple
    (chrom, strand, percentiles). For a transcript whose length equals
    mRNA_len_cut the percentiles are computed by mystat.percentile_list over
    its last mRNA_len_cut bases; for longer transcripts they are computed
    over the bases that follow the first mRNA_len_cut bases.

    If refbed is None a message is written to stderr and the process exits
    with status 0.
    '''
    if refbed is None:
        sys.stderr.write(
            "You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    g_percentiles = {}
    transcript_count = 0
    # The context manager guarantees the BED file is closed even on error.
    with open(refbed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                tx_end = int(fields[2])
                geneName = fields[3]
                strand = fields[5]
                geneID = '_'.join(
                    [str(j)
                     for j in (chrom, tx_start, tx_end, geneName, strand)])

                exon_starts = [
                    int(v) + tx_start
                    for v in fields[11].rstrip(',\n').split(',')
                ]
                exon_sizes = [
                    int(v) for v in fields[10].rstrip(',\n').split(',')
                ]
                # A start/size count mismatch marks a malformed record; it is
                # skipped like any other unparsable line.
                if len(exon_starts) != len(exon_sizes):
                    raise ValueError("exon start/size count mismatch")
                exon_ends = [
                    st + size for st, size in zip(exon_starts, exon_sizes)
                ]
            except (IndexError, ValueError):
                # Only parse failures are swallowed; KeyboardInterrupt and
                # other unrelated errors propagate.
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " +
                    line)
                continue
            transcript_count += 1

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                # 1-based coordinates on genome
                gene_all_base.extend(range(st + 1, end + 1))
            if len(gene_all_base) < mRNA_len_cut:
                continue
            g_percentiles[geneID] = (
                chrom, strand,
                mystat.percentile_list(gene_all_base[-mRNA_len_cut:]))
            if len(gene_all_base) <= mRNA_len_cut:
                continue
            # NOTE(review): for transcripts longer than mRNA_len_cut the entry
            # stored above is replaced by percentiles over the bases after the
            # first mRNA_len_cut ones. That slice can be shorter than
            # mRNA_len_cut -- confirm this selection is intended.
            g_percentiles[geneID] = (
                chrom, strand,
                mystat.percentile_list(gene_all_base[mRNA_len_cut:]))
    printlog("Total " + str(transcript_count) + ' transcripts loaded')
    return g_percentiles
コード例 #2
0
ファイル: plotter.py プロジェクト: pdl30/pyribotools
def genebody_percentile(anno, gene_filter, mRNA_len_cut=100):
    '''
    Return percentile points of the gene body for genes in an annotation table.

    anno: path to a whitespace-separated table whose columns are read as
        gene name, chromosome, start, end, strand flag ("1" means "+",
        anything else "-"). Lines starting with 'Ensembl' and blank lines are
        ignored. Only chromosomes "MT", "X", "Y" and purely numeric names are
        kept; they are renamed to "chrM", "chrX", "chrY", "chr<N>".
    gene_filter: optional path to a file whose first tab-separated column
        lists the gene names to keep. When it is falsy, or the file yields no
        names, every gene is kept.
    mRNA_len_cut: genes spanning fewer bases than this are skipped.

    Returns a dict mapping "chrom_start_end_name_strand" to a tuple
    (chrom, strand, mystat.percentile_list(<1-based gene coordinates>)).
    '''
    # A set gives O(1) membership tests for every annotation line.
    wanted = set()
    if gene_filter:
        with open(gene_filter) as f:
            for line in f:
                wanted.add(line.rstrip().split("\t")[0])

    g_percentiles = {}
    with open(anno, 'r') as table:
        for line in table:
            if line.startswith('Ensembl'):
                continue
            # Parse fields from gene table
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            chrom_name = fields[1]
            if chrom_name == "MT":
                chrom = "chrM"
            elif chrom_name in ("X", "Y") or chrom_name.isdigit():
                chrom = "chr" + chrom_name
            else:
                continue
            tx_start = int(fields[2])
            tx_end = int(fields[3])
            geneName = fields[0]
            strand = "+" if fields[4] == "1" else "-"
            if wanted and geneName not in wanted:
                continue
            # 1-based coordinates on genome
            gene_all_base = list(range(tx_start + 1, tx_end + 1))
            if len(gene_all_base) < mRNA_len_cut:
                continue
            geneID = '_'.join(
                [str(j) for j in (chrom, tx_start, tx_end, geneName, strand)])
            g_percentiles[geneID] = (chrom, strand,
                                     mystat.percentile_list(gene_all_base))
    return g_percentiles
コード例 #3
0
def Rcode_write(dataset, file_prefix, format='pdf', length=100):
    '''Generate an R script that plots the given curves.

    dataset: iterable of 3-item records (name, data, extra). For every record
        one R vector assignment "name <- c(...)" is written; the third item
        is ignored.
    file_prefix: the script is written to file_prefix + '.r', and the R
        graphics device is opened on "<file_prefix>.curves.<format>".
    format: name of the R graphics device function (lower-cased), also used
        as the plot file extension.
    length: the x vector is mystat.percentile_list applied to 1..length.

    With one curve a single plot() is emitted; with 2-6 curves the remaining
    ones are added via lines() plus a legend; with more than 6 curves a
    layout() call is emitted first and the legend goes into its own panel.
    With no curves only the device open/close calls and the x/icolor
    definitions are written.
    '''
    plot_cmd = "plot(x,%s,type='l',xlab=\"Gene body, bp  (5\'->3\')\", ylab=\"Starts coverage\",lwd=0.8,col=icolor[1])"
    names = []
    # The context manager closes the script even if writing fails part-way.
    with open(file_prefix + '.r', 'w') as rout:

        def emit(text):
            # One R statement per line.
            rout.write(text + '\n')

        for name, data, _ in dataset:
            names.append(name)
            emit(name + ' <- c(' + ','.join([str(i) for i in data]) + ')')
        x = mystat.percentile_list(list(range(1, length + 1)))

        emit('\n')

        emit('%s(\"%s.%s\")' % (format.lower(), file_prefix + ".curves",
                                format.lower()))
        emit("x %s" % (' <- c(' + ','.join([str(i) for i in x]) + ')'))
        emit('icolor = colorRampPalette(c("#7fc97f","#beaed4","#fdc086","#ffff99","#386cb0","#f0027f"))(%d)'
             % (len(names)))

        curve_count = len(names)
        quoted_names = ','.join(["'" + str(n) + "'" for n in names])
        crowded = curve_count > 6

        if crowded:
            # Reserve a side panel for the legend when there are many curves.
            emit('layout(matrix(c(1,1,1,2,1,1,1,2,1,1,1,2), 4, 4, byrow = TRUE))')
        if curve_count >= 1:
            emit(plot_cmd % (names[0]))
        for i in range(1, curve_count):
            emit("lines(x,%s,type='l',col=icolor[%d])" % (names[i], i + 1))
        if crowded:
            emit('par(mar=c(1,0,2,1))')
            emit('plot.new()')
            emit("legend(0,1,fill=icolor[%d:%d],legend=c(%s))" %
                 (1, curve_count, quoted_names))
        elif curve_count >= 2:
            emit("legend(0,1,fill=icolor[%d:%d], legend=c(%s))" %
                 (1, curve_count, quoted_names))

        emit('dev.off()')
コード例 #4
0
def genebody_percentile(refbed, mRNA_len_cut=100):
    '''
    Return percentile points of the gene body for each transcript in a BED file.

    refbed: path to a gene-model BED file. Every data line must have at
        least 12 whitespace-separated columns; column 11 is read as the
        comma-separated exon sizes and column 12 as the comma-separated exon
        starts relative to the transcript start. Lines starting with '#',
        'track' or 'browser' are ignored; lines that cannot be parsed are
        reported on stderr and skipped.
    mRNA_len_cut: transcripts whose summed exon length is smaller than this
        value are skipped.

    Returns a dict mapping "chrom_start_end_name_strand" to a tuple
    (chrom, strand, mystat.percentile_list(<1-based exonic coordinates>)).

    If refbed is None a message is written to stderr and the process exits
    with status 0.
    '''
    if refbed is None:
        sys.stderr.write(
            "You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    g_percentiles = {}
    transcript_count = 0
    # The context manager guarantees the BED file is closed even on error.
    with open(refbed, 'r') as bed:
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                tx_end = int(fields[2])
                geneName = fields[3]
                strand = fields[5]
                geneID = '_'.join(
                    [str(j)
                     for j in (chrom, tx_start, tx_end, geneName, strand)])

                exon_starts = [
                    int(v) + tx_start
                    for v in fields[11].rstrip(',\n').split(',')
                ]
                exon_sizes = [
                    int(v) for v in fields[10].rstrip(',\n').split(',')
                ]
                # A start/size count mismatch marks a malformed record; it is
                # skipped like any other unparsable line.
                if len(exon_starts) != len(exon_sizes):
                    raise ValueError("exon start/size count mismatch")
                exon_ends = [
                    st + size for st, size in zip(exon_starts, exon_sizes)
                ]
            except (IndexError, ValueError):
                # Only parse failures are swallowed; KeyboardInterrupt and
                # other unrelated errors propagate.
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " +
                    line)
                continue
            transcript_count += 1

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                # 1-based coordinates on genome
                gene_all_base.extend(range(st + 1, end + 1))
            if len(gene_all_base) < mRNA_len_cut:
                continue
            # Percentile points taken over this transcript's exonic positions
            g_percentiles[geneID] = (chrom, strand,
                                     mystat.percentile_list(gene_all_base))
    printlog("Total " + str(transcript_count) + ' transcripts loaded')
    return g_percentiles
コード例 #5
0
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype="png"):
    '''Calculate reads coverage over gene body, from 5' to 3'.

    For every transcript in the BED file the exonic positions are ordered
    5'->3' (reversed for '-' strand genes), reduced to percentile points with
    mystat.percentile_list, and the bigwig signal at each point is added to a
    per-percentile running sum.

    bigFile: path to a bigwig file.
    refbed: path to a 12-column gene-model BED file. Lines starting with '#',
        'track' or 'browser' are ignored; unparsable lines are reported on
        stderr and skipped.
    outfile: output prefix. Writes <outfile>.geneBodyCoverage.txt (percentile
        and summed signal) and <outfile>.geneBodyCoverage_plot.r (an R script
        plotting the sums divided by the number of parsed genes).
    gtype: name of the R graphics device function, also used as the plot
        file extension.

    Transcripts whose total exonic length is below 100 bases are skipped.
    If refbed is None a message is written to stderr and the process exits
    with status 0.
    '''
    if refbed is None:
        sys.stderr.write(
            "You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    # All four files are opened up front (so an unwritable output fails
    # before the long computation) and are closed on every exit path.
    # The bigwig is binary data, hence 'rb'.
    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as OUT1, \
            open(outfile + ".geneBodyCoverage.txt", 'w') as OUT2, \
            open(bigFile, 'rb') as bw_handle, \
            open(refbed, 'r') as bed:
        bw = BigWigFile(file=bw_handle)
        sys.stderr.write("calculating coverage over gene body ...\n")
        coverage = collections.defaultdict(int)
        gene_count = 0
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                int(fields[2])  # transcript end: parsed for validation only
                strand = fields[5]

                exon_starts = [
                    int(v) + tx_start
                    for v in fields[11].rstrip(',\n').split(',')
                ]
                exon_sizes = [
                    int(v) for v in fields[10].rstrip(',\n').split(',')
                ]
                # A start/size count mismatch marks a malformed record.
                if len(exon_starts) != len(exon_sizes):
                    raise ValueError("exon start/size count mismatch")
                exon_ends = [
                    st + size for st, size in zip(exon_starts, exon_sizes)
                ]
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " +
                    line)
                continue

            # Count only records that parsed, so malformed lines do not
            # inflate the denominator of the plotted average.
            gene_count += 1

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))
            # The length test must see the whole transcript: testing inside
            # the exon loop dropped every gene whose first exon was < 100 bp.
            if len(gene_all_base) < 100:
                continue

            # Order positions 5'->3'; minus-strand genes run high to low.
            gene_all_base.sort(reverse=(strand == '-'))
            percentile_base = mystat.percentile_list(gene_all_base)

            for i, pos in enumerate(percentile_base):
                sig = bw.get_as_array(chrom, pos - 1, pos)
                if sig is None:
                    continue
                coverage[i] += np.nan_to_num(sig[0])
            sys.stderr.write("  %d genes finished\r" % gene_count)

        y_coord = []
        OUT2.write("percentile\tcount\n")
        # Sorted order keeps the R 'y' vector aligned with the x axis no
        # matter in which order percentile indices first received signal.
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            OUT2.write(str(i) + '\t' + str(coverage[i]) + '\n')

        OUT1.write("%s(\'%s\')\n" %
                   (gtype, outfile + ".geneBodyCoverage." + gtype))
        OUT1.write("x=0:100\n")
        OUT1.write("y=c(" + ','.join(y_coord) + ')\n')
        OUT1.write(
            "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')\n"
            % gene_count)
        OUT1.write("dev.off()\n")
コード例 #6
0
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype='png'):
    '''Calculate reads coverage over gene body, from 5' to 3'.

    For every transcript in the BED file the exonic positions are ordered
    5'->3' (reversed for '-' strand genes), reduced to percentile points with
    mystat.percentile_list, and the bigwig signal at each point is added to a
    per-percentile running sum.

    bigFile: path to a bigwig file, opened with openBigWig.
    refbed: path to a 12-column gene-model BED file. Lines starting with '#',
        'track' or 'browser' are ignored; unparsable lines are reported on
        stderr and skipped; transcripts on chromosomes absent from the bigwig
        are skipped silently.
    outfile: output prefix. Writes <outfile>.geneBodyCoverage.txt (percentile
        and summed signal) and <outfile>.geneBodyCoverage_plot.r (an R script
        plotting the sums divided by the number of parsed genes).
    gtype: name of the R graphics device function, also used as the plot
        file extension.

    Transcripts whose total exonic length is below 100 bases are skipped.
    If refbed is None a message is written to stderr and the process exits
    with status 0.
    '''

    if refbed is None:
        print("You must specify a bed file representing gene model",
              file=sys.stderr)
        sys.exit(0)

    bw = openBigWig(bigFile, 'r')
    coverage = defaultdict(int)
    gene_count = 0
    try:
        # Get chromosomes present in the bigwig file
        chroms = bw.chroms().keys()

        print("calculating coverage over gene body ...", file=sys.stderr)
        with open(refbed, 'r') as handle:
            # Loop through the genes in the BED file
            for line in handle:
                if line.startswith(('#', 'track', 'browser')):
                    continue
                try:
                    # Parse fields from gene table
                    fields = line.split()
                    chrom = fields[0]
                    tx_start = int(fields[1])
                    int(fields[2])  # transcript end: validation only
                    strand = fields[5]

                    # Skip chromosomes present in the bed file but not
                    # present in the bigwig file. This could happen with
                    # PATCHES or Unplaced chromosomes.
                    if chrom not in chroms:
                        continue

                    exon_starts = [
                        int(v) + tx_start
                        for v in fields[11].rstrip(',\n').split(',')
                    ]
                    exon_sizes = [
                        int(v) for v in fields[10].rstrip(',\n').split(',')
                    ]
                    exon_ends = [
                        st + size
                        for st, size in zip(exon_starts, exon_sizes)
                    ]
                except (IndexError, ValueError):
                    # Only parse failures are swallowed; KeyboardInterrupt
                    # and unrelated errors propagate.
                    print(
                        "[NOTE: input bed must be 12-column] skipped this line: "
                        + line,
                        end='\n',
                        file=sys.stderr)
                    continue

                # Count gene if it was properly read
                gene_count += 1

                gene_all_base = []
                for st, end in zip(exon_starts, exon_ends):
                    gene_all_base.extend(range(st + 1, end + 1))
                # The length test must see the whole transcript: testing
                # inside the exon loop dropped every gene whose first exon
                # was shorter than 100 bp.
                if len(gene_all_base) < 100:
                    continue

                # Sort coordinates according to the strand
                gene_all_base.sort(reverse=(strand == '-'))

                # Get percentile points from the gene's coordinates
                percentile_base = mystat.percentile_list(gene_all_base)

                for i, pos in enumerate(percentile_base):
                    sig = bw.values(chrom, pos - 1, pos)
                    coverage[i] += nan_to_num(sig[0])

                print(" \t%d genes finished\r" % gene_count,
                      end=' ',
                      file=sys.stderr)
    finally:
        # Close bigwig file, also when parsing or signal lookup fails
        bw.close()
    print("\n", file=sys.stderr)

    y_coord = []
    with open(outfile + ".geneBodyCoverage.txt", 'w') as handle:
        handle.write("percentile\tcount\n")
        # Sorted order keeps the R 'y' vector aligned with the x axis.
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            # NOTE(review): %i truncates the summed signal to an integer
            # here, while the R script below receives the full value --
            # confirm the truncation is intended.
            handle.write("%i\t%i\n" % (i, coverage[i]))

    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as handle:
        handle.write("%s(\'%s\')\n" %
                     (gtype, outfile + ".geneBodyCoverage." + gtype))
        handle.write("x=1:100\n")
        handle.write("y=c(%s)\n" % ','.join(y_coord))
        handle.write(
            "plot(x, y/%s, xlab=\"percentile of gene body (5'->3')\", ylab='average wigsum', type='s')\n"
            % gene_count)
        handle.write("dev.off()\n")
コード例 #7
0
ファイル: geneBody_coverage2.py プロジェクト: kspham/rnaqual
def coverageGeneBody_bigwig(bigFile, refbed, outfile, gtype="png"):
    '''Calculate reads coverage over gene body, from 5' to 3'.

    For every transcript in the BED file the exonic positions are ordered
    5'->3' (reversed for '-' strand genes), reduced to percentile points with
    mystat.percentile_list, and the bigwig signal at each point is added to a
    per-percentile running sum.

    bigFile: path to a bigwig file.
    refbed: path to a 12-column gene-model BED file. Lines starting with '#',
        'track' or 'browser' are ignored; unparsable lines are reported on
        stderr and skipped.
    outfile: output prefix. Writes <outfile>.geneBodyCoverage.txt (percentile
        and summed signal) and <outfile>.geneBodyCoverage_plot.r (an R script
        plotting the sums divided by the number of parsed genes).
    gtype: name of the R graphics device function, also used as the plot
        file extension.

    Transcripts whose total exonic length is below 100 bases are skipped.
    If refbed is None a message is written to stderr and the process exits
    with status 0.
    '''
    if refbed is None:
        sys.stderr.write(
            "You must specify a bed file representing gene model\n\n")
        sys.exit(0)

    # All four files are opened up front (so an unwritable output fails
    # before the long computation) and are closed on every exit path.
    # The bigwig is binary data, hence 'rb'.
    with open(outfile + ".geneBodyCoverage_plot.r", 'w') as OUT1, \
            open(outfile + ".geneBodyCoverage.txt", 'w') as OUT2, \
            open(bigFile, 'rb') as bw_handle, \
            open(refbed, 'r') as bed:
        bw = BigWigFile(file=bw_handle)
        sys.stderr.write("calculating coverage over gene body ...\n")
        coverage = collections.defaultdict(int)
        gene_count = 0
        for line in bed:
            if line.startswith(('#', 'track', 'browser')):
                continue
            try:
                # Parse fields from gene table
                fields = line.split()
                chrom = fields[0]
                tx_start = int(fields[1])
                int(fields[2])  # transcript end: parsed for validation only
                strand = fields[5]

                exon_starts = [
                    int(v) + tx_start
                    for v in fields[11].rstrip(',\n').split(',')
                ]
                exon_sizes = [
                    int(v) for v in fields[10].rstrip(',\n').split(',')
                ]
                # A start/size count mismatch marks a malformed record.
                if len(exon_starts) != len(exon_sizes):
                    raise ValueError("exon start/size count mismatch")
                exon_ends = [
                    st + size for st, size in zip(exon_starts, exon_sizes)
                ]
            except (IndexError, ValueError):
                sys.stderr.write(
                    "[NOTE:input bed must be 12-column] skipped this line: " +
                    line)
                continue

            # Count only records that parsed, so malformed lines do not
            # inflate the denominator of the plotted average.
            gene_count += 1

            gene_all_base = []
            for st, end in zip(exon_starts, exon_ends):
                gene_all_base.extend(range(st + 1, end + 1))
            # The length test must see the whole transcript: testing inside
            # the exon loop dropped every gene whose first exon was < 100 bp.
            if len(gene_all_base) < 100:
                continue

            # Order positions 5'->3'; minus-strand genes run high to low.
            gene_all_base.sort(reverse=(strand == '-'))
            percentile_base = mystat.percentile_list(gene_all_base)

            for i, pos in enumerate(percentile_base):
                sig = bw.get_as_array(chrom, pos - 1, pos)
                if sig is None:
                    continue
                coverage[i] += np.nan_to_num(sig[0])
            sys.stderr.write("  %d genes finished\r" % gene_count)

        y_coord = []
        OUT2.write("percentile\tcount\n")
        # Sorted order keeps the R 'y' vector aligned with the x axis no
        # matter in which order percentile indices first received signal.
        for i in sorted(coverage):
            y_coord.append(str(coverage[i]))
            OUT2.write(str(i) + '\t' + str(coverage[i]) + '\n')

        OUT1.write("%s(\'%s\')\n" %
                   (gtype, outfile + ".geneBodyCoverage." + gtype))
        OUT1.write("x=0:100\n")
        OUT1.write("y=c(" + ','.join(y_coord) + ')\n')
        OUT1.write(
            "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')\n"
            % gene_count)
        OUT1.write("dev.off()\n")