def main():
    """Print one 12-column BED-style line per transcript of a GTF file.

    Command line: ``%prog [options] <gtf file>``.  By default the blocks of
    each line are the transcript's exons; with ``-c`` they are its CDS
    records.  Output goes to stdout, tab separated.
    """
    usage = "usage: %prog [options] <gtf file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="cds",
        action="store_true",
        default=False,
        help="Use CDS, not exons [Default: %default]",
    )
    (options, args) = parser.parse_args()

    # parser.error() terminates the program, so no else branch is needed.
    if len(args) != 1:
        parser.error("Must provide gtf file")
    gtf_file = args[0]

    genes = gff.read_genes(gtf_file)

    for transcript_id in genes:
        g = genes[transcript_id]

        # Both modes emit the same layout; only the feature list differs.
        blocks = g.cds if options.cds else g.exons

        # Block lengths are inclusive (end - start + 1); block starts are
        # offsets from the first block's start.
        block_sizes = ",".join(str(b.end - b.start + 1) for b in blocks)
        block_starts = ",".join(str(b.start - blocks[0].start) for b in blocks)

        cols = [
            g.chrom,
            str(blocks[0].start - 1),  # shift the start down by one for BED
            str(blocks[-1].end),
            transcript_id,
            "0",
            g.strand,
            "0",
            "0",
            "255,0,0",
            str(len(blocks)),
            block_sizes,
            block_starts,
        ]

        print("\t".join(cols))
def main():
    """Write each transcript in a GTF file as a 12-column BED-style record.

    Usage: ``%prog [options] <gtf file>``.  The ``-c`` flag switches the
    block source from exons to CDS records.  Records are printed to stdout.
    """
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cds', action='store_true', default=False,
                      help='Use CDS, not exons [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gtf file')
    else:
        gtf_file = args[0]

    genes = gff.read_genes(gtf_file)

    for transcript_id in genes:
        g = genes[transcript_id]

        # choose which features define the blocks
        if options.cds:
            features = g.cds
        else:
            features = g.exons

        # inclusive lengths, and starts relative to the first feature
        lengths = []
        rel_starts = []
        for feat in features:
            lengths.append(str(feat.end - feat.start + 1))
            rel_starts.append(str(feat.start - features[0].start))

        bed_fields = [
            g.chrom,
            str(features[0].start - 1),
            str(features[-1].end),
            transcript_id,
            '0',
            g.strand,
            '0',
            '0',
            '255,0,0',
            str(len(features)),
            ','.join(lengths),
            ','.join(rel_starts),
        ]
        print('\t'.join(bed_fields))
def get_tss(gtf_file, upstream, downstream):
    """Collect a window around the transcription start of every gene.

    For '+' strand genes the window is anchored on the first exon's start;
    every other strand value is anchored on the last exon's end, with the
    upstream/downstream extents swapped.

    Returns a dict mapping chromosome to a sorted list of
    ``(start, end, strand, gene_id)`` tuples.
    """
    genes = gff.read_genes(gtf_file)

    tss_intervals = {}
    for gid in genes:
        g = genes[gid]

        if g.strand == '+':
            anchor = g.exons[0].start
            window = (anchor - upstream, anchor + downstream)
        else:
            anchor = g.exons[-1].end
            window = (anchor - downstream, anchor + upstream)

        if g.chrom not in tss_intervals:
            tss_intervals[g.chrom] = []
        tss_intervals[g.chrom].append(window + (g.strand, gid))

    # sort each chromosome's windows in place
    for chrom_windows in tss_intervals.values():
        chrom_windows.sort()

    return tss_intervals
def get_tss(gtf_file, upstream, downstream):
    """Return per-chromosome, sorted TSS windows for the genes in a GTF.

    Each entry is ``(start, end, strand, gene_id)``.  A gene on '+' spans
    ``first_exon.start - upstream`` to ``first_exon.start + downstream``; any
    other strand spans ``last_exon.end - downstream`` to
    ``last_exon.end + upstream``.
    """
    by_chrom = {}
    genes = gff.read_genes(gtf_file)

    for gid in genes:
        g = genes[gid]
        forward = g.strand == '+'

        # anchor position and the extents to its left and right
        tss = g.exons[0].start if forward else g.exons[-1].end
        left = upstream if forward else downstream
        right = downstream if forward else upstream

        entry = (tss - left, tss + right, g.strand, gid)
        by_chrom.setdefault(g.chrom, []).append(entry)

    for chrom in by_chrom:
        by_chrom[chrom] = sorted(by_chrom[chrom])

    return by_chrom
def main():
    """Print one 'promoter' GTF record per gene of a reference GTF.

    Usage: ``%prog [options] <ref_gtf>``.

    For each gene a representative transcript is chosen: the most expressed
    isoform when ``-f`` supplies cufflinks FPKM tracking, otherwise the
    isoform with the most upstream TSS.  The promoter spans ``-u`` bp upstream
    to ``-d`` bp downstream of that TSS, oriented by strand.  Genes without a
    '+'/'-' strand, or whose promoter would start before position 1, are
    skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking',
                      help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # Reuse the source column (2nd field) of the first record.  Blank lines
    # and '#' header/comment lines are skipped so they are not mistaken for a
    # record, and the handle is closed as soon as the value is found.
    source = '.'
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            if line.strip() and not line.startswith('#'):
                source = line.split()[1]
                break

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])

        # the first transcript's strand stands in for the whole gene
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform (geometric mean of 1+FPKM)
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean(
                [1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])

            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean(
                    [1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                # a NaN running maximum is always replaced
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss; on ties the earliest transcript wins
            promoter_tid = None
            tss = None
            for transcript_id in gene_transcripts:
                if gene_strand == '+':
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    more_upstream = tss is None or transcript_pos < tss
                else:
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    more_upstream = tss is None or transcript_pos > tss
                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # orient the promoter window by strand
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        if promoter_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id,
                  file=sys.stderr)
            continue

        # print promoter from the tss
        tx = transcripts[promoter_tid]
        cols = [tx.chrom, source, 'promoter', str(promoter_start), str(promoter_end),
                '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
        print('\t'.join(cols))
def get_splice_intervals(gtf_file, window):
    """Return windows centred on the 5' and 3' splice sites of a GTF.

    Only genes with more than one exon contribute.  Exons are taken to be in
    ascending genomic order (NOTE(review): relies on ``gff.read_genes``
    sorting them -- confirm).  On the '+' strand a 5' site sits at the end of
    every exon but the last and a 3' site at the start of every exon but the
    first.  On any other strand the transcript runs the opposite way, so the
    5' site sits at the *start* of every exon but the genomically first one
    and the 3' site at the *end* of every exon but the genomically last one.

    Args:
        gtf_file: path handed to ``gff.read_genes``.
        window: total window width; half is placed on each side of a site.

    Returns:
        ``(intervals_5p, intervals_3p)``: dicts mapping chromosome to a sorted
        list of unique ``(start, end, strand)`` tuples.
    """
    # Floor division keeps the coordinates integral under true division too.
    half = window // 2

    intervals_5p = {}
    intervals_3p = {}

    genes = gff.read_genes(gtf_file)
    for gid in genes:
        g = genes[gid]
        if len(g.exons) < 2:
            continue

        if g.strand == '+':
            sites_5p = [exon.end for exon in g.exons[:-1]]
            sites_3p = [exon.start for exon in g.exons[1:]]
        else:
            # Reverse strand: the exon edge facing the intron is mirrored.
            # Using exon.end for 5' sites here would pick up the TSS of the
            # first exon instead of its splice junction.
            sites_5p = [exon.start for exon in g.exons[1:]]
            sites_3p = [exon.end for exon in g.exons[:-1]]

        chrom_5p = intervals_5p.setdefault(g.chrom, set())
        for pos in sites_5p:
            chrom_5p.add((pos - half, pos + half, g.strand))

        chrom_3p = intervals_3p.setdefault(g.chrom, set())
        for pos in sites_3p:
            chrom_3p.add((pos - half, pos + half, g.strand))

    # convert sets to sorted lists
    for chrom in intervals_5p:
        intervals_5p[chrom] = sorted(intervals_5p[chrom])
    for chrom in intervals_3p:
        intervals_3p[chrom] = sorted(intervals_3p[chrom])

    return intervals_5p, intervals_3p
def compute_coverage(anchor_gff, event_files, mode, anchor_is_gtf, bins):
    """Accumulate event coverage over anchors and count the events.

    Each event file (``.bam`` or ``.gff``) is intersected with ``anchor_gff``
    by running ``intersectBed``; every overlap increments a range of the
    anchor's coverage array, weighted by 1/NH for multi-mapping BAM reads.

    Args:
        anchor_gff: anchor GFF/GTF path.
        event_files: iterable of ``.bam`` / ``.gff`` paths.
        mode, bins: passed through to ``initialize_coverage`` and
            ``find_inc_coords``.
        anchor_is_gtf: when true, anchors are keyed by ``transcript_id`` and
            transcript structures/lengths are handed to ``find_inc_coords``;
            otherwise anchors are keyed by ``(chrom, start, end)``.

    Returns:
        ``(coverage, events)``.  ``events`` adds 0.5/NH per paired and 1/NH
        per unpaired BAM alignment, and 1 per GFF line.
    """
    ############################################
    # initialize
    ############################################
    coverage = initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins)

    if anchor_is_gtf:
        # get transcript structures
        transcripts = gff.read_genes(anchor_gff, key_id='transcript_id')

        # compute lengths (inclusive exon coordinates); transcripts without
        # exons get no entry
        transcript_lengths = {}
        for tid in transcripts:
            exons = transcripts[tid].exons
            if exons:
                transcript_lengths[tid] = sum(ex.end - ex.start + 1 for ex in exons)
    else:
        transcripts = None
        transcript_lengths = None

    events = 0
    for event_file in event_files:
        sys.stderr.write('Computing coverage for %s\n' % event_file)
        is_bam = event_file[-4:] == '.bam'

        ############################################
        # preprocess BAM/GFF
        ############################################
        multi_maps = {}
        if is_bam:
            # count fragments and hash multi-mappers
            bam_in = pysam.Samfile(event_file, 'rb')
            try:
                for aligned_read in bam_in:
                    try:
                        nh_tag = aligned_read.opt('NH')
                    except KeyError:
                        # no NH tag: treat the read as uniquely mapped
                        nh_tag = 1.0

                    if aligned_read.is_paired:
                        events += 0.5 / nh_tag
                    else:
                        events += 1.0 / nh_tag

                    if nh_tag > 1:
                        multi_maps[aligned_read.qname] = nh_tag
            finally:
                bam_in.close()

        elif event_file[-4:] == '.gff':
            with open(event_file) as gff_in:
                events += sum(1 for _ in gff_in)

        else:
            sys.stderr.write('Unknown event file format %s\n' % event_file)

        ############################################
        # intersect BAM w/ anchors
        ############################################
        # NOTE(review): the paths are interpolated into a shell command
        # unquoted; names with spaces or shell metacharacters will break it.
        if is_bam:
            cmd = 'intersectBed -split -wo -bed -abam %s -b %s' % (event_file, anchor_gff)
            acol = 12  # first anchor column after the 12 BED columns
        else:
            cmd = 'intersectBed -s -wo -a %s -b %s' % (event_file, anchor_gff)
            acol = 9   # first anchor column after the 9 GFF columns
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

        for line in p.stdout:
            a = line.split('\t')

            if is_bam:
                rstart = int(a[1]) + 1  # convert back to 1-based gff from bed
                rend = int(a[2])
                rheader = a[3]
            else:
                rstart = int(a[3])
                rend = int(a[4])

            # because intersectBed screws up indels near endpoints
            if rstart >= rend:
                continue

            achrom = a[acol]
            astart = int(a[acol + 3])
            aend = int(a[acol + 4])
            astrand = a[acol + 6]

            if anchor_is_gtf:
                anchor_id = gff.gtf_kv(a[acol + 8])['transcript_id']
            else:
                anchor_id = (achrom, astart, aend)

            # find where to increment
            inc_start, inc_end = find_inc_coords(anchor_id, astart, aend, astrand,
                                                 rstart, rend, mode, bins,
                                                 transcripts, transcript_lengths)
            if inc_start is None:
                continue

            mm = 1.0
            if is_bam:
                # find multi-map number, which may require removing a suffix.
                # Only strip when a '/' is actually present: rfind() returns
                # -1 otherwise, which would chop the last character and could
                # match a different read's name.
                if rheader in multi_maps:
                    mm = multi_maps[rheader]
                elif '/' in rheader:
                    mm = multi_maps.get(rheader[:rheader.rfind('/')], 1.0)

            # increment!
            weight = 1.0 / mm
            anchor_cov = coverage[anchor_id]
            for i in range(inc_start, inc_end):
                anchor_cov[i] += weight

        p.communicate()

    return coverage, events
def main():
    """Write a GTF holding the reference exons plus unspliced pre-RNA spans.

    Usage: ``%prog [options] <ref_gtf> <prerna_gtf>``.

    Every reference exon is copied to ``<prerna_gtf>``.  For each transcript a
    single 'exon' record covering first-exon start to last-exon end is added
    under a fresh ``PRERNA<n>`` transcript_id, unless an identical span
    (chrom, start, end, strand) already exists as a single-exon transcript or
    an earlier pre-RNA.  With ``-m`` the output is then intersected with
    itself via ``intersectBed`` and pre-RNAs overlapping more than that many
    other genes are dropped.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_genes_overlapped', default=None, type='int',
                      help='Don\'t include isoforms that overlap more than this many genes [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    else:
        ref_gtf = args[0]
        prerna_gtf = args[1]

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            prerna_hash.add((tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand))

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start), str(exon.end),
                        '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                # shallow copy so the transcript's own attributes stay intact
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end),
                        '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets
        p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (prerna_gtf, prerna_gtf),
                             shell=True, stdout=subprocess.PIPE)

        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            # only pre-RNAs are filtered; column 17 holds the attributes of
            # the second (-b) record
            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']
                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1, set()).add(gid2)

        p.communicate()

        # filter into a temp gtf; the mkstemp descriptor is wrapped directly
        # so it is closed with the file, and the temp file is removed even if
        # filtering fails
        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        try:
            with os.fdopen(prerna_tmp_fd, 'w') as tmp_out:
                with open(prerna_gtf) as gtf_in:
                    for line in gtf_in:
                        tid = gff.gtf_kv(line.split('\t')[8])['transcript_id']
                        if len(tx_overlaps.get(tid, ())) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_tmp_file) as tmp_in:
                with open(prerna_gtf, 'w') as prerna_out:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            os.remove(prerna_tmp_file)
def main():
    """Build a pre-RNA augmented GTF from a reference GTF.

    Usage: ``%prog [options] <ref_gtf> <prerna_gtf>``.

    The output contains every reference exon plus, per transcript, one
    'exon' record spanning the whole transcript (first exon start to last
    exon end) with ``transcript_id`` ``PRERNA<n>`` and ``transcript_type``
    ``prerna``.  Spans already present as a single-exon transcript or an
    earlier pre-RNA are not repeated.  When ``-m`` is given, pre-RNAs that
    overlap (same strand, via ``intersectBed``) more than that many other
    genes are removed from the output.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option(
        '-m',
        dest='max_genes_overlapped',
        default=None,
        type='int',
        help='Don\'t include isoforms that overlap more than this many genes [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    else:
        ref_gtf = args[0]
        prerna_gtf = args[1]

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            only_exon = tx.exons[0]
            prerna_hash.add((tx.chrom, only_exon.start, only_exon.end, tx.strand))

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start), str(exon.end),
                        '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas
            if pre_key in prerna_hash:
                continue
            prerna_hash.add(pre_key)

            # copy the attributes so the reference transcript is not modified
            pre_kv = copy.copy(tx.kv)
            pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
            pre_kv['transcript_type'] = 'prerna'
            prerna_index += 1

            cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end),
                    '.', tx.strand, '.', gff.kv_gtf(pre_kv))
            prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is None:
        return

    # intersect with self and compute overlap sets
    p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' %
                         (prerna_gtf, prerna_gtf),
                         shell=True,
                         stdout=subprocess.PIPE)

    tx_overlaps = {}
    for line in p.stdout:
        a = line.split('\t')

        kv1 = gff.gtf_kv(a[8])
        tid1 = kv1['transcript_id']
        if not tid1.startswith('PRERNA'):
            continue

        # a[17] is the attribute column of the overlapped (-b) record
        gid1 = kv1['gene_id']
        gid2 = gff.gtf_kv(a[17])['gene_id']
        if gid1 != gid2:
            tx_overlaps.setdefault(tid1, set()).add(gid2)

    p.communicate()

    # filter into a temp gtf.  The descriptor from mkstemp is reused instead
    # of opening the path a second time, and the temp file is always removed.
    prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
    try:
        with os.fdopen(prerna_tmp_fd, 'w') as tmp_out:
            with open(prerna_gtf) as gtf_in:
                for line in gtf_in:
                    a = line.split('\t')
                    tid = gff.gtf_kv(a[8])['transcript_id']
                    n_overlapped = len(tx_overlaps.get(tid, ()))
                    if n_overlapped <= options.max_genes_overlapped:
                        tmp_out.write(line)

        # rewrite temp to the final output
        with open(prerna_tmp_file) as tmp_in:
            with open(prerna_gtf, 'w') as prerna_out:
                for line in tmp_in:
                    prerna_out.write(line)
    finally:
        os.remove(prerna_tmp_file)
def compute_coverage(anchor_gff, event_files, mode, anchor_is_gtf, bins):
    """Accumulate event coverage over anchors and count the events.

    Each event file (``.bam`` or ``.gff``) is intersected with ``anchor_gff``
    by running ``intersectBed``; every overlap increments a range of the
    anchor's coverage array, weighted by 1/NH for multi-mapping BAM reads.

    Args:
        anchor_gff: anchor GFF/GTF path.
        event_files: iterable of ``.bam`` / ``.gff`` paths.
        mode, bins: passed through to ``initialize_coverage`` and
            ``find_inc_coords``.
        anchor_is_gtf: when true, anchors are keyed by ``transcript_id`` and
            transcript structures/lengths are handed to ``find_inc_coords``;
            otherwise anchors are keyed by the string ``'chrom:start-end'``.

    Returns:
        ``(coverage, events)``.  ``events`` adds 0.5/NH per paired and 1/NH
        per unpaired BAM alignment, and 1 per GFF line.
    """
    ############################################
    # initialize
    ############################################
    coverage = initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins)

    if anchor_is_gtf:
        # get transcript structures
        transcripts = gff.read_genes(anchor_gff, key_id='transcript_id')

        # compute lengths (inclusive exon coordinates); transcripts without
        # exons get no entry
        transcript_lengths = {}
        for tid in transcripts:
            exons = transcripts[tid].exons
            if exons:
                transcript_lengths[tid] = sum(
                    ex.end - ex.start + 1 for ex in exons)
    else:
        transcripts = None
        transcript_lengths = None

    events = 0
    for event_file in event_files:
        sys.stderr.write('Computing coverage for %s in %s\n' %
                         (event_file, anchor_gff))
        is_bam = event_file[-4:] == '.bam'

        ############################################
        # preprocess BAM/GFF
        ############################################
        multi_maps = {}
        if is_bam:
            # count fragments and hash multi-mappers
            bam_in = pysam.Samfile(event_file, 'rb')
            try:
                for aligned_read in bam_in:
                    try:
                        nh_tag = aligned_read.opt('NH')
                    except KeyError:
                        # no NH tag: treat the read as uniquely mapped
                        nh_tag = 1.0

                    if aligned_read.is_paired:
                        events += 0.5 / nh_tag
                    else:
                        events += 1.0 / nh_tag

                    if nh_tag > 1:
                        multi_maps[aligned_read.qname] = nh_tag
            finally:
                bam_in.close()

        elif event_file[-4:] == '.gff':
            with open(event_file) as gff_in:
                events += sum(1 for _ in gff_in)

        else:
            sys.stderr.write('Unknown event file format %s\n' % event_file)

        ############################################
        # intersect BAM w/ anchors
        ############################################
        # NOTE(review): the paths are interpolated into a shell command
        # unquoted; names with spaces or shell metacharacters will break it.
        if is_bam:
            cmd = 'intersectBed -split -wo -bed -abam %s -b %s' % (
                event_file, anchor_gff)
            acol = 12  # first anchor column after the 12 BED columns
        else:
            cmd = 'intersectBed -s -wo -a %s -b %s' % (event_file, anchor_gff)
            acol = 9  # first anchor column after the 9 GFF columns
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

        for line in p.stdout:
            a = line.split('\t')

            if is_bam:
                rstart = int(a[1]) + 1  # convert back to 1-based gff from bed
                rend = int(a[2])
                rheader = a[3]
            else:
                rstart = int(a[3])
                rend = int(a[4])

            # because intersectBed screws up indels near endpoints
            if rstart >= rend:
                continue

            achrom = a[acol]
            astart = int(a[acol + 3])
            aend = int(a[acol + 4])
            astrand = a[acol + 6]

            if anchor_is_gtf:
                anchor_id = gff.gtf_kv(a[acol + 8])['transcript_id']
            else:
                anchor_id = '%s:%d-%d' % (achrom, astart, aend)

            # find where to increment
            inc_start, inc_end = find_inc_coords(anchor_id, astart, aend,
                                                 astrand, rstart, rend, mode,
                                                 bins, transcripts,
                                                 transcript_lengths)
            if inc_start is None:
                continue

            mm = 1.0
            if is_bam:
                # find multi-map number, which may require removing a suffix.
                # Only strip when a '/' is actually present: rfind() returns
                # -1 otherwise, which would chop the last character and could
                # match a different read's name.
                if rheader in multi_maps:
                    mm = multi_maps[rheader]
                elif '/' in rheader:
                    mm = multi_maps.get(rheader[:rheader.rfind('/')], 1.0)

            # increment!
            weight = 1.0 / mm
            anchor_cov = coverage[anchor_id]
            for i in range(inc_start, inc_end):
                anchor_cov[i] += weight

        p.communicate()

    return coverage, events
def main():
    """Print one 'promoter' GTF record per gene of a reference GTF.

    Usage: ``%prog [options] <ref_gtf>``.

    For each gene a representative transcript is chosen: the most expressed
    isoform when ``-f`` supplies cufflinks FPKM tracking, otherwise the
    isoform with the most upstream TSS.  The promoter spans ``-u`` bp upstream
    to ``-d`` bp downstream of that TSS, oriented by strand.  Genes without a
    '+'/'-' strand, or whose promoter would start before position 1, are
    skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # Reuse the source column (2nd field) of the first record.  Blank lines
    # and '#' header/comment lines are skipped so they are not mistaken for a
    # record, and the handle is closed as soon as the value is found.
    source = '.'
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            if line.strip() and not line.startswith('#'):
                source = line.split()[1]
                break

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])

        # the first transcript's strand stands in for the whole gene
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            sys.stderr.write('WARNING: %s discluded for lack of strand\n' % gene_id)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform (geometric mean of 1+FPKM)
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([
                1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)
            ])

            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([
                    1 + fpkm
                    for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
                ])
                # a NaN running maximum is always replaced
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss; on ties the earliest transcript wins
            promoter_tid = None
            tss = None
            for transcript_id in gene_transcripts:
                if gene_strand == '+':
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    more_upstream = tss is None or transcript_pos < tss
                else:
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    more_upstream = tss is None or transcript_pos > tss
                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # orient the promoter window by strand
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        if promoter_start < 1:
            sys.stderr.write(
                'WARNING: %s discluded for nearness to chromosome end\n' % gene_id)
            continue

        # print promoter from the tss
        tx = transcripts[promoter_tid]
        cols = [
            tx.chrom, source, 'promoter',
            str(promoter_start),
            str(promoter_end), '.', tx.strand, '.',
            gff.kv_gtf(tx.kv)
        ]
        print('\t'.join(cols))
def get_splice_intervals(gtf_file, window):
    """Return windows centred on the 5' and 3' splice sites of a GTF.

    Only genes with more than one exon contribute.  Exons are taken to be in
    ascending genomic order (NOTE(review): relies on ``gff.read_genes``
    sorting them -- confirm).  On the "+" strand a 5' site sits at the end of
    every exon but the last and a 3' site at the start of every exon but the
    first.  On any other strand the transcript runs the opposite way, so the
    5' site sits at the *start* of every exon but the genomically first one
    and the 3' site at the *end* of every exon but the genomically last one.

    Args:
        gtf_file: path handed to ``gff.read_genes``.
        window: total window width; half is placed on each side of a site.

    Returns:
        ``(intervals_5p, intervals_3p)``: dicts mapping chromosome to a sorted
        list of unique ``(start, end, strand)`` tuples.
    """
    # Floor division keeps the coordinates integral under true division too.
    half = window // 2

    intervals_5p = {}
    intervals_3p = {}

    genes = gff.read_genes(gtf_file)
    for gid in genes:
        g = genes[gid]
        if len(g.exons) < 2:
            continue

        if g.strand == "+":
            sites_5p = [exon.end for exon in g.exons[:-1]]
            sites_3p = [exon.start for exon in g.exons[1:]]
        else:
            # Reverse strand: the exon edge facing the intron is mirrored.
            # Using exon.end for 5' sites here would pick up the TSS of the
            # first exon instead of its splice junction.
            sites_5p = [exon.start for exon in g.exons[1:]]
            sites_3p = [exon.end for exon in g.exons[:-1]]

        chrom_5p = intervals_5p.setdefault(g.chrom, set())
        for pos in sites_5p:
            chrom_5p.add((pos - half, pos + half, g.strand))

        chrom_3p = intervals_3p.setdefault(g.chrom, set())
        for pos in sites_3p:
            chrom_3p.add((pos - half, pos + half, g.strand))

    # convert sets to sorted lists
    for chrom in intervals_5p:
        intervals_5p[chrom] = sorted(intervals_5p[chrom])
    for chrom in intervals_3p:
        intervals_3p[chrom] = sorted(intervals_3p[chrom])

    return intervals_5p, intervals_3p