def transform_third_codon(start, end, intervals_with_gff):
    """Keep only third-codon-position nucleotides inside a window.

    Each element of ``intervals_with_gff`` is a ``(istart, iend, gff)``
    triple.  ``gff.frame`` is read as the number of nucleotides between
    the interval start and the start of the next codon; ``gff.strand``
    decides whether the interval is walked from the window end
    (negative strand) or from the window start.

    Every selected position ``c`` is emitted as the single-base interval
    ``(c, c + 1)`` and the collected list is passed through
    ``Intervals.combineIntervals`` before being returned.

    Raises
    ------
    ValueError
        If an entry has no frame (``gff.frame == "."``).
    """
    window_length = end - start
    positions = []

    for seg_start, seg_end, gff in intervals_with_gff:
        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # phase = nucleotides from interval start to the next codon start
        phase = int(gff.frame)

        # Work in 0-based, strand-relative coordinates: 0 is the first
        # window base in reading direction.
        on_reverse = bool(Genomics.IsNegativeStrand(gff.strand))
        if on_reverse:
            rel_start, rel_end = end - seg_end, end - seg_start
        else:
            rel_start, rel_end = seg_start - start, seg_end - start

        # Clip to the window start, carrying the phase along with the clip.
        if rel_start < 0:
            phase = (phase + rel_start) % 3
            rel_start = 0

        # Move to the first third-codon position at or after rel_start:
        # phase 0 -> +2, otherwise back (3 - phase) and forward 2.
        if phase == 0:
            first = rel_start + 2
        else:
            first = rel_start + phase - 1

        last = min(rel_end, window_length)

        for offset in range(first, last, 3):
            # translate back into the caller's coordinate system
            pos = end - offset - 1 if on_reverse else start + offset
            positions.append((pos, pos + 1))

    return Intervals.combineIntervals(positions)
def annotateGREATDomains(iterator, fasta, options):
    """Build GREAT-style regulatory domains.

    A basal region is defined around the TSS of every transcript
    (``options.upstream`` / ``options.downstream``, strand aware, clipped
    to ``[0, contig length]``).  The basal regions of all transcripts of
    a gene are merged by taking the smallest start and the largest end,
    so a gene's basal region may be larger than
    ``options.upstream + options.downstream``.

    Basal regions are then extended by up to ``options.radius`` towards
    the neighbouring groups of overlapping basal regions and written to
    ``options.stdout``, one entry per gene.

    Parameters
    ----------
    iterator
        Iterator of GTF entries, handed to ``GTF.gene_iterator``.
    fasta
        Object providing ``getLength(contig)``.
    options
        Object providing ``upstream``, ``downstream``, ``radius`` and
        ``stdout``.

    Side effects
    ------------
    Besides writing to ``options.stdout``, the unextended basal regions
    are dumped to ``test.gff`` in the current working directory, and the
    counter summary is logged through ``E.info``.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    # NOTE(review): relies on E.Counter treating unset attributes as 0
    # so that ``+=`` works - confirm against the E module.
    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    for gene in gene_iterator:
        counter.genes += 1
        first_entry = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(first_entry.strand)
        lcontig = fasta.getLength(first_entry.contig)

        # collect one basal region per transcript
        regulons = []
        for transcript in gene:
            counter.transcripts += 1
            mi = min(x.start for x in transcript)
            ma = max(x.end for x in transcript)
            # add range to both sides of the tss
            if is_negative_strand:
                interval = ma - downstream, ma + upstream
            else:
                interval = mi - upstream, mi + downstream

            # clip to contig boundaries
            regulons.append((min(lcontig, max(0, interval[0])),
                             min(lcontig, max(0, interval[1]))))

        # the gene's basal region spans all transcript basal regions
        start = min(x[0] for x in regulons)
        end = max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(first_entry, first_entry.gene_id, first_entry.gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    # Sorted once here; the extension step below depends on this order.
    regions.sort(key=lambda x: (x.contig, x.start))

    # NOTE(review): hard-coded debug dump of the basal regions; kept
    # because removing it would change the function's side effects.
    outf = iotools.open_file("test.gff", "w")
    try:
        for x in regions:
            outf.write(str(x) + "\n")
    finally:
        # close the handle even if a write fails
        outf.close()

    ####################################################################
    # extend basal regions

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # Snapshot of the basal intervals in this group, taken before
        # any entry of the group is extended.
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            """Return True if pos lies strictly inside a basal interval.

            Intervals that merely touch ``pos`` with their start or end
            are ignored.  The early exit relies on ``intervals`` being
            ordered by start.
            """
            for lo, hi in intervals:
                if lo == pos or hi == pos:
                    continue
                if lo <= pos < hi:
                    return True
                if lo > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min(x.start for x in nxt)
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end   = largest basal end of the previous group
        # next_start = smallest basal start of the next group
        #
        # Always extend downstream; extend upstream only if that end of
        # the basal region is not already inside another basal region
        # of the group.
        save_end = 0
        for gtf in group:
            # record the basal (unextended) end before modifying it
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        # start from 0 again once the last group of a contig is done
        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
def annotateGenes(iterator, fasta, options):
    """Annotate gene structures.

    For every transcript this emits intervals for exons, introns and
    flanking regions.  With ``"exons"`` / ``"introns"`` in
    ``options.detail`` the features are labelled ``first_*``,
    ``middle_*`` and ``last_*`` (in transcription direction); otherwise
    plain ``exon`` / ``intron``.  Flanks are tiled in steps of
    ``options.increment`` up to ``options.flank`` and labelled
    ``upstream_<distance>`` / ``downstream_<distance>``; flank windows
    reaching outside ``[0, contig length]`` are dropped.

    Annotation is per transcript; intervals of the same feature are
    merged per gene with ``Intervals.combine``.  In order to achieve a
    unique tiling, use only a single transcript per gene and remove any
    overlap between genes.

    Parameters
    ----------
    iterator
        Iterator of GTF entries, handed to ``GTF.gene_iterator``.
    fasta
        Object providing ``getLength(contig)``; genes on contigs for
        which it raises ``KeyError`` are skipped.
    options
        Object providing ``increment``, ``flank``, ``detail`` and
        ``stdout``.

    Notes
    -----
    ``nskipped`` counts both skipped genes (unknown contig) and skipped
    transcripts (no exon features).
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                """Append an entry for interval labelled anno to results."""
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = sorted((x.start, x.end)
                           for x in transcript if x.feature == "exon")
            if not exons:
                # Nothing to annotate; without this ``continue`` the
                # code below would fail with IndexError on exons[0].
                nskipped += 1
                continue

            # introns are the gaps between consecutive exons
            introns = [(left[1], right[0])
                       for left, right in zip(exons, exons[1:])]

            # add flank: tile outwards from the transcript boundaries
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            # order everything in transcription direction
            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # groupby needs its input ordered by the grouping key; the sort
        # is stable, so doing it once per gene gives the same order as
        # re-sorting after every transcript.
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results,
                                           key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            # merged intervals inherit all other fields from the first
            # entry of their feature group
            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
def annotateRegulons(iterator, fasta, tss, options):
    """Write one regulon interval per transcript (or merged per gene).

    A window of ``options.upstream`` / ``options.downstream`` bases is
    placed, strand aware, around the transcription start site when
    ``tss`` is true and around the transcription termination site
    otherwise.  Windows are clipped to ``[0, contig length]``.

    With ``options.merge_promotors`` set, the windows of a gene are
    merged with ``Intervals.combine`` and numbered ``1..n`` in place of
    the original transcript ids.

    Output goes to ``options.stdout`` with source ``regulon``; a summary
    line is logged through ``E.info``.
    """
    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)

        windows = []
        labels = []
        for transcript in gene:
            ntranscripts += 1
            leftmost = min(entry.start for entry in transcript)
            rightmost = max(entry.end for entry in transcript)

            # The anchor is the rightmost coordinate for a TSS on the
            # negative strand or a TTS on the positive strand, and the
            # leftmost coordinate in the other two cases.
            if bool(tss) == bool(is_negative_strand):
                anchor = rightmost
            else:
                anchor = leftmost

            if is_negative_strand:
                lower, upper = anchor - downstream, anchor + upstream
            else:
                lower, upper = anchor - upstream, anchor + downstream

            # clip to contig boundaries
            lower = min(lcontig, max(0, lower))
            upper = min(lcontig, max(0, upper))

            windows.append((lower, upper))
            labels.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merging may reorder/collapse windows, so renumber them
            windows = Intervals.combine(windows)
            labels = ["%i" % number
                      for number in range(1, len(windows) + 1)]

        # a single entry object is reused for every window of the gene
        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        for (start, end), label in zip(windows, labels):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = label
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))