Example #1
0
def process_gene_model(gene_model):
	print >>sys.stderr, "processing " + gene_model + ' ...',
	obj = BED.ParseBED(gene_model)
	utr_3 = obj.getUTR(utr=3)
	utr_5 = obj.getUTR(utr=5)
	cds_exon = obj.getCDSExon()
	intron = obj.getIntron()
	
	intron = BED.unionBed3(intron)
	cds_exon=BED.unionBed3(cds_exon)
	utr_5 = BED.unionBed3(utr_5)
	utr_3 = BED.unionBed3(utr_3)
	
	utr_5 = BED.subtractBed3(utr_5,cds_exon)
	utr_3 = BED.subtractBed3(utr_3,cds_exon)
	intron = BED.subtractBed3(intron,cds_exon)
	intron = BED.subtractBed3(intron,utr_5)
	intron = BED.subtractBed3(intron,utr_3)
	
	intergenic_up_1kb = obj.getIntergenic(direction="up",size=1000)
	intergenic_down_1kb = obj.getIntergenic(direction="down",size=1000)
	intergenic_up_5kb = obj.getIntergenic(direction="up",size=5000)
	intergenic_down_5kb = obj.getIntergenic(direction="down",size=5000)	
	intergenic_up_10kb = obj.getIntergenic(direction="up",size=10000)
	intergenic_down_10kb = obj.getIntergenic(direction="down",size=10000)
	
	#merge integenic region
	intergenic_up_1kb=BED.unionBed3(intergenic_up_1kb)
	intergenic_up_5kb=BED.unionBed3(intergenic_up_5kb)
	intergenic_up_10kb=BED.unionBed3(intergenic_up_10kb)
	intergenic_down_1kb=BED.unionBed3(intergenic_down_1kb)
	intergenic_down_5kb=BED.unionBed3(intergenic_down_5kb)
	intergenic_down_10kb=BED.unionBed3(intergenic_down_10kb)	
	
	#purify intergenic region
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,cds_exon)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,utr_5)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,utr_3)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,intron)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,cds_exon)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,utr_5)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,utr_3)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,intron)	

	#purify intergenic region
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,cds_exon)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,utr_5)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,utr_3)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,intron)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,cds_exon)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,utr_5)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,utr_3)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,intron)	
	
	#purify intergenic region
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,cds_exon)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,utr_5)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,utr_3)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,intron)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,cds_exon)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,utr_5)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,utr_3)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,intron)	
	
	#build intervalTree 
	cds_exon_ranges = build_bitsets(cds_exon)
	utr_5_ranges = build_bitsets(utr_5)
	utr_3_ranges = build_bitsets(utr_3)
	intron_ranges = build_bitsets(intron)
	interg_ranges_up_1kb_ranges = build_bitsets(intergenic_up_1kb)
	interg_ranges_up_5kb_ranges = build_bitsets(intergenic_up_5kb)
	interg_ranges_up_10kb_ranges = build_bitsets(intergenic_up_10kb)
	interg_ranges_down_1kb_ranges = build_bitsets(intergenic_down_1kb)
	interg_ranges_down_5kb_ranges = build_bitsets(intergenic_down_5kb)
	interg_ranges_down_10kb_ranges = build_bitsets(intergenic_down_10kb)
	
	exon_size = cal_size(cds_exon)
	intron_size = cal_size(intron)
	utr3_size = cal_size(utr_3)
	utr5_size = cal_size(utr_5)
	int_up1k_size = cal_size(intergenic_up_1kb)
	int_up5k_size = cal_size(intergenic_up_5kb)
	int_up10k_size = cal_size(intergenic_up_10kb)
	int_down1k_size = cal_size(intergenic_down_1kb)
	int_down5k_size = cal_size(intergenic_down_5kb)
	int_down10k_size = cal_size(intergenic_down_10kb)
	
	print >>sys.stderr, "Done"
	return (cds_exon_ranges,intron_ranges,utr_5_ranges,utr_3_ranges,\
			interg_ranges_up_1kb_ranges,interg_ranges_up_5kb_ranges,interg_ranges_up_10kb_ranges,\
			interg_ranges_down_1kb_ranges,interg_ranges_down_5kb_ranges,interg_ranges_down_10kb_ranges,\
			exon_size,intron_size,utr5_size,utr3_size,\
			int_up1k_size,int_up5k_size,int_up10k_size,\
			int_down1k_size,int_down5k_size,int_down10k_size)
def process_gene_model(gene_model):
	print >>sys.stderr, "processing " + gene_model + ' ...',
	obj = BED.ParseBED(gene_model)
	utr_3 = obj.getUTR(utr=3)
	utr_5 = obj.getUTR(utr=5)
	cds_exon = obj.getCDSExon()
	intron = obj.getIntron()
	
	intron = BED.unionBed3(intron)
	cds_exon=BED.unionBed3(cds_exon)
	utr_5 = BED.unionBed3(utr_5)
	utr_3 = BED.unionBed3(utr_3)
	
	utr_5 = BED.subtractBed3(utr_5,cds_exon)
	utr_3 = BED.subtractBed3(utr_3,cds_exon)
	intron = BED.subtractBed3(intron,cds_exon)
	intron = BED.subtractBed3(intron,utr_5)
	intron = BED.subtractBed3(intron,utr_3)
	
	intergenic_up_1kb = obj.getIntergenic(direction="up",size=1000)
	intergenic_down_1kb = obj.getIntergenic(direction="down",size=1000)
	intergenic_up_5kb = obj.getIntergenic(direction="up",size=5000)
	intergenic_down_5kb = obj.getIntergenic(direction="down",size=5000)	
	intergenic_up_10kb = obj.getIntergenic(direction="up",size=10000)
	intergenic_down_10kb = obj.getIntergenic(direction="down",size=10000)
	
	#merge integenic region
	intergenic_up_1kb=BED.unionBed3(intergenic_up_1kb)
	intergenic_up_5kb=BED.unionBed3(intergenic_up_5kb)
	intergenic_up_10kb=BED.unionBed3(intergenic_up_10kb)
	intergenic_down_1kb=BED.unionBed3(intergenic_down_1kb)
	intergenic_down_5kb=BED.unionBed3(intergenic_down_5kb)
	intergenic_down_10kb=BED.unionBed3(intergenic_down_10kb)	
	
	#purify intergenic region
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,cds_exon)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,utr_5)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,utr_3)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,intron)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,cds_exon)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,utr_5)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,utr_3)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,intron)	

	#purify intergenic region
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,cds_exon)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,utr_5)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,utr_3)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,intron)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,cds_exon)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,utr_5)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,utr_3)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,intron)	
	
	#purify intergenic region
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,cds_exon)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,utr_5)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,utr_3)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,intron)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,cds_exon)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,utr_5)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,utr_3)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,intron)	
	
	#build intervalTree 
	cds_exon_ranges = build_bitsets(cds_exon)
	utr_5_ranges = build_bitsets(utr_5)
	utr_3_ranges = build_bitsets(utr_3)
	intron_ranges = build_bitsets(intron)
	interg_ranges_up_1kb_ranges = build_bitsets(intergenic_up_1kb)
	interg_ranges_up_5kb_ranges = build_bitsets(intergenic_up_5kb)
	interg_ranges_up_10kb_ranges = build_bitsets(intergenic_up_10kb)
	interg_ranges_down_1kb_ranges = build_bitsets(intergenic_down_1kb)
	interg_ranges_down_5kb_ranges = build_bitsets(intergenic_down_5kb)
	interg_ranges_down_10kb_ranges = build_bitsets(intergenic_down_10kb)
	
	exon_size = cal_size(cds_exon)
	intron_size = cal_size(intron)
	utr3_size = cal_size(utr_3)
	utr5_size = cal_size(utr_5)
	int_up1k_size = cal_size(intergenic_up_1kb)
	int_up5k_size = cal_size(intergenic_up_5kb)
	int_up10k_size = cal_size(intergenic_up_10kb)
	int_down1k_size = cal_size(intergenic_down_1kb)
	int_down5k_size = cal_size(intergenic_down_5kb)
	int_down10k_size = cal_size(intergenic_down_10kb)
	
	print >>sys.stderr, "Done"
	return (cds_exon_ranges,intron_ranges,utr_5_ranges,utr_3_ranges,\
			interg_ranges_up_1kb_ranges,interg_ranges_up_5kb_ranges,interg_ranges_up_10kb_ranges,\
			interg_ranges_down_1kb_ranges,interg_ranges_down_5kb_ranges,interg_ranges_down_10kb_ranges,\
			exon_size,intron_size,utr5_size,utr3_size,\
			int_up1k_size,int_up5k_size,int_up10k_size,\
			int_down1k_size,int_down5k_size,int_down10k_size)
Example #3
0
def main():
	usage="%prog [options]" + '\n' + __doc__ + "\n"
	parser = OptionParser(usage,version="%prog " + __version__)
	parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in SAM format. Use \"-\" represents standard input [required]")
	parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. [required]")
	(options,args)=parser.parse_args()
		
	if not (options.input_file and options.ref_gene_model):
		parser.print_help()
		sys.exit(0)
	if not os.path.exists(options.ref_gene_model):
		print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n'
		#parser.print_help()
		sys.exit(0)
	if os.path.exists(options.input_file):
		file_obj=open(options.input_file)
		pass
	elif options.input_file == '-':
		file_obj=sys.stdin
		pass
	else:
		print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
		#parser.print_help()
		sys.exit(0)		
	print >>sys.stderr, "processing " + options.ref_gene_model + ' ...',
	obj = BED.ParseBED(options.ref_gene_model)
	utr_3 = obj.getUTR(utr=3)
	utr_5 = obj.getUTR(utr=5)
	cds_exon = obj.getCDSExon()
	intron = obj.getIntron()
	
	intron = BED.unionBed3(intron)
	cds_exon=BED.unionBed3(cds_exon)
	utr_5 = BED.unionBed3(utr_5)
	utr_3 = BED.unionBed3(utr_3)
	
	utr_5 = BED.subtractBed3(utr_5,cds_exon)
	utr_3 = BED.subtractBed3(utr_3,cds_exon)
	intron = BED.subtractBed3(intron,cds_exon)
	intron = BED.subtractBed3(intron,utr_5)
	intron = BED.subtractBed3(intron,utr_3)
	
	intergenic_up_1kb = obj.getIntergenic(direction="up",size=1000)
	intergenic_down_1kb = obj.getIntergenic(direction="down",size=1000)
	intergenic_up_5kb = obj.getIntergenic(direction="up",size=5000)
	intergenic_down_5kb = obj.getIntergenic(direction="down",size=5000)	
	intergenic_up_10kb = obj.getIntergenic(direction="up",size=10000)
	intergenic_down_10kb = obj.getIntergenic(direction="down",size=10000)
	
	#merge integenic region
	intergenic_up_1kb=BED.unionBed3(intergenic_up_1kb)
	intergenic_up_5kb=BED.unionBed3(intergenic_up_5kb)
	intergenic_up_10kb=BED.unionBed3(intergenic_up_10kb)
	intergenic_down_1kb=BED.unionBed3(intergenic_down_1kb)
	intergenic_down_5kb=BED.unionBed3(intergenic_down_5kb)
	intergenic_down_10kb=BED.unionBed3(intergenic_down_10kb)	
	
	#purify intergenic region
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,cds_exon)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,utr_5)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,utr_3)
	intergenic_up_1kb=BED.subtractBed3(intergenic_up_1kb,intron)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,cds_exon)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,utr_5)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,utr_3)
	intergenic_down_1kb=BED.subtractBed3(intergenic_down_1kb,intron)	

	#purify intergenic region
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,cds_exon)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,utr_5)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,utr_3)
	intergenic_up_5kb=BED.subtractBed3(intergenic_up_5kb,intron)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,cds_exon)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,utr_5)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,utr_3)
	intergenic_down_5kb=BED.subtractBed3(intergenic_down_5kb,intron)	
	
	#purify intergenic region
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,cds_exon)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,utr_5)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,utr_3)
	intergenic_up_10kb=BED.subtractBed3(intergenic_up_10kb,intron)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,cds_exon)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,utr_5)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,utr_3)
	intergenic_down_10kb=BED.subtractBed3(intergenic_down_10kb,intron)	
	
	print >>sys.stderr, "Done"
	
	ranges={}
	totalReads=0
	spliceReads=0
	cUR=0
	multiMapReads=0
	
	print >>sys.stderr, "reading SAM file",
	for line in file_obj:
		if line.startswith("@"):continue
		fields=line.rstrip('\n ').split()
		flagCode=string.atoi(fields[1])
		if (flagCode & 0x0004) != 0: continue		#skip unmap reads
		totalReads +=1
		if not SAM.ParseSAM._uniqueHit_pat.search(line):		#skip multiple mapped reads
			multiMapReads +=1
			continue

		chrom = fields[2].upper()
		chromStart = string.atoi(fields[3])-1
		comb=[int(i) for i in SAM.ParseSAM._splicedHit_pat.findall(fields[5])]	#"9M4721N63M3157N8M" return ['9', '4721', '63', '3157', '8']
		cUR += (len(comb) +1)/2
		if(len(comb)>1):
			spliceReads += 1
		blockStart=[]
		blockSize=[]
			
		for i in range(0,len(comb),2):
			blockStart.append(chromStart + sum(comb[:i]) )
				
		for i in range(0,len(comb),2):
			blockSize.append(comb[i])
			
		for st,size in zip(blockStart,blockSize):
			mid = int(st) + (size/2)
			if chrom not in ranges:
				ranges[chrom] = Intersecter()
			else:
				ranges[chrom].add_interval( Interval( mid, mid ) )
	print >>sys.stderr, "Done"
	print >>sys.stderr, "Total Reads: " + str(totalReads)
	print >>sys.stderr, "Multiple Hits: " + str(multiMapReads)
	print >>sys.stderr, "Unique Hits: " + str(totalReads-multiMapReads)
	print >>sys.stderr, "Spliced Hits: " + str(spliceReads)
	print >>sys.stderr, "Total fragments: " + str(cUR)
	
	
	
	print >>sys.stderr, "\nAssignning reads ...",
	intron_read=0
	intron_base=0
	cds_exon_read=0
	cds_exon_base=0
	utr_5_read=0
	utr_5_base=0
	utr_3_read=0
	utr_3_base=0
	
	intergenic_up1kb_base=0
	intergenic_up1kb_read=0
	intergenic_down1kb_base=0
	intergenic_down1kb_read=0
	intergenic_up5kb_base=0
	intergenic_up5kb_read=0
	intergenic_down5kb_base=0
	intergenic_down5kb_read=0
	intergenic_up10kb_base=0
	intergenic_up10kb_read=0
	intergenic_down10kb_base=0
	intergenic_down10kb_read=0	
	
	(intron_base,intron_read) = base_read(intron,ranges)
	(cds_exon_base,cds_exon_read) = base_read(cds_exon,ranges)
	(utr_5_base,utr_5_read) = base_read(utr_5,ranges)
	(utr_3_base,utr_3_read) = base_read(utr_3,ranges)
	(intergenic_up1kb_base, intergenic_up1kb_read) = base_read(intergenic_up_1kb,ranges)
	(intergenic_up5kb_base, intergenic_up5kb_read) = base_read(intergenic_up_5kb,ranges)
	(intergenic_up10kb_base, intergenic_up10kb_read) = base_read(intergenic_up_10kb,ranges)
	(intergenic_down1kb_base, intergenic_down1kb_read) = base_read(intergenic_down_1kb,ranges)
	(intergenic_down5kb_base, intergenic_down5kb_read) = base_read(intergenic_down_5kb,ranges)
	(intergenic_down10kb_base, intergenic_down10kb_read) = base_read(intergenic_down_10kb,ranges)
	
	print >>sys.stderr, "Done"
	
	print >>sys.stderr, "========================================================="
	print >>sys.stderr, "Group\tTotal_bases\tReads_count\tReads/Kb"
	print >>sys.stderr, "CDS Exons:\t%d\t%d\t%5.2f" % (cds_exon_base,cds_exon_read,cds_exon_read*1000.0/cds_exon_base)
	print >>sys.stderr, "5'UTR Exons:\t%d\t%d\t%5.2f" % (utr_5_base,utr_5_read, utr_5_read*1000.0/utr_5_base)
	print >>sys.stderr, "3'UTR Exons:\t%d\t%d\t%5.2f" % (utr_3_base,utr_3_read, utr_3_read*1000.0/utr_3_base)
	print >>sys.stderr, "Intronic region:\t%d\t%d\t%5.2f" % (intron_base,intron_read,intron_read*1000.0/intron_base)
	
	print >>sys.stderr, "TSS up 1kb:\t%d\t%d\t%5.2f" % (intergenic_up1kb_base, intergenic_up1kb_read, intergenic_up1kb_read*1000.0/intergenic_up1kb_base)
	print >>sys.stderr, "TSS up 5kb:\t%d\t%d\t%5.2f" % (intergenic_up5kb_base, intergenic_up5kb_read, intergenic_up5kb_read*1000.0/intergenic_up5kb_base)
	print >>sys.stderr, "TSS up 10kb:\t%d\t%d\t%5.2f" % (intergenic_up10kb_base, intergenic_up10kb_read, intergenic_up10kb_read*1000.0/intergenic_up10kb_base)
	print >>sys.stderr, "TES down 1kb:\t%d\t%d\t%5.2f" % (intergenic_down1kb_base, intergenic_down1kb_read, intergenic_down1kb_read*1000.0/intergenic_down1kb_base)
	print >>sys.stderr, "TES down 5kb:\t%d\t%d\t%5.2f" % (intergenic_down5kb_base, intergenic_down5kb_read, intergenic_down5kb_read*1000.0/intergenic_down5kb_base)	
	print >>sys.stderr, "TES down 10kb:\t%d\t%d\t%5.2f" % (intergenic_down10kb_base, intergenic_down10kb_read, intergenic_down10kb_read*1000.0/intergenic_down10kb_base)
	print >>sys.stderr, "========================================================="