def _iterator(iterator):
    """Yield every cluster of overlapping entries with its free flanks.

    Entries from *iterator* are grouped with ``GTF.iterator_overlaps``.
    For each group a triple ``(prev_end, group, next_start)`` is
    produced, where

    * ``prev_end`` is the end of the group preceding it on the same
      contig, or 0 if it is the first group seen on that contig;
    * ``next_start`` is the start of the group following it on the same
      contig, or ``fasta.getLength(contig)`` if it is the last one.

    If ``method == "tss"`` the boundaries are taken from a one-base
    window: the smallest start on the ``+`` strand, the largest end
    minus one otherwise (strand is read from the first entry of the
    group).

    ``method``, ``fasta`` and ``GTF`` are not parameters; they are
    looked up in the surrounding scope.
    """
    # A group can only be emitted once its successor is known, so the
    # loop always lags one group behind the one just read.
    previous_end = 0        # end of the group before the pending one
    pending_end = 0         # end of the pending group
    pending = None          # group waiting to be emitted
    pending_contig = None

    for cluster in GTF.iterator_overlaps(iterator):
        cluster_start = min(entry.start for entry in cluster)
        cluster_end = max(entry.end for entry in cluster)

        if method == "tss":
            # shrink the cluster to a single position at its TSS
            if cluster[0].strand == "+":
                cluster_end = cluster_start + 1
            else:
                cluster_start = cluster_end - 1

        contig = cluster[0].contig
        if pending_contig == contig:
            # same contig: the pending group ends where this one starts
            yield previous_end, pending, cluster_start
        else:
            # new contig: flush the pending group up to the contig end
            # and start afresh
            if pending:
                yield previous_end, pending, fasta.getLength(pending_contig)
            previous_end = pending_end = 0

        previous_end, pending_end = pending_end, cluster_end
        pending, pending_contig = cluster, contig

    # the very last group is bounded by the end of its contig
    if pending:
        yield previous_end, pending, fasta.getLength(pending_contig)
def _overlaps_basal_region(intervals, pos):
    """Return True if *pos* falls inside one of *intervals*.

    Intervals that start or end exactly at *pos* are ignored; this
    excludes the interval *pos* was taken from (and any other interval
    sharing that boundary). The scan stops at the first interval that
    starts beyond *pos*, so *intervals* must be ordered by start.
    """
    for start, end in intervals:
        if start == pos or end == pos:
            continue
        if start <= pos < end:
            return True
        if start > pos:
            return False
    return False


def annotateGREATDomains(iterator, fasta, options):
    """Build GREAT domains and write them to ``options.stdout``.

    The work is done in two passes:

    1. For every gene a *basal region* is computed. Each transcript
       contributes ``options.upstream`` bases upstream and
       ``options.downstream`` bases downstream of its TSS, clipped to
       ``[0, contig length]``. The per-transcript regions of a gene are
       merged into one span, so a gene's basal region can be larger
       than ``options.upstream + options.downstream``.
    2. Basal regions are grouped by overlap and each one is extended by
       up to ``options.radius`` bases on either side, but never beyond
       the previous group's basal end / the next group's basal start
       on the same contig (0 / contig length at the contig
       boundaries). Extension downstream is always applied; extension
       upstream is skipped if the upstream boundary already lies
       inside another basal region of the same group.

    Arguments
    ---------
    iterator :
        Iterator of GTF entries, passed to ``GTF.gene_iterator``.
    fasta :
        Object providing ``getLength(contig)``.
    options :
        Object providing ``upstream``, ``downstream``, ``radius`` and
        ``stdout`` (a writable file-like object).

    Side effects: one line per domain is written to ``options.stdout``,
    the unextended basal regions are written to ``test.gff`` in the
    current directory, and the counters are logged with ``E.info``.
    """
    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    regions = []
    for gene in GTF.gene_iterator(iterator):
        counter.genes += 1
        # the first entry of the first transcript supplies strand,
        # contig and gene_id for the whole gene
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        # collect every basal region per transcript
        basal = []
        for transcript in gene:
            counter.transcripts += 1
            # add range to both sides of tss
            if is_negative_strand:
                tss = max(x.end for x in transcript)
                lower, upper = tss - downstream, tss + upstream
            else:
                tss = min(x.start for x in transcript)
                lower, upper = tss - upstream, tss + downstream
            # clip to the contig
            basal.append((min(lcontig, max(0, lower)),
                          min(lcontig, max(0, upper))))

        # merged span over all transcripts of the gene
        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "greatdomain"
        gtf.start = min(x[0] for x in basal)
        gtf.end = max(x[1] for x in basal)
        regions.append(gtf)

    # sorted once here; nothing below reorders or changes the keys
    # before the overlap grouping
    regions.sort(key=lambda x: (x.contig, x.start))

    # NOTE(review): this dump of the basal regions to a fixed file name
    # in the working directory looks like a debugging left-over. It is
    # kept so that behaviour is unchanged - confirm whether it can go.
    with iotools.open_file("test.gff", "w") as outf:
        for region in regions:
            outf.write(str(region) + "\n")

    ####################################################################
    # extend basal regions
    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    # last_end = basal end of the previous group on the same contig
    last_end = 0
    for region_id, group in enumerate(groups):
        contig = group[0].contig

        # snapshot of the basal intervals, taken before any entry of
        # the group is extended
        intervals = [(x.start, x.end) for x in group]

        # deal with boundary cases - end of contig
        nxt = groups[region_id + 1] if region_id + 1 < len(groups) else None
        is_last_on_contig = nxt is None or nxt[0].contig != contig
        if is_last_on_contig:
            next_start = fasta.getLength(contig)
        else:
            # next_start = basal start of the next group
            next_start = min(x.start for x in nxt)

        # extend region towards previous/next group: always extend
        # downstream, but upstream only extend if the basal region of
        # an interval is not already overlapping another basal region
        # within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not _overlaps_basal_region(intervals, gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not _overlaps_basal_region(intervals, gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        # the next group starts a new contig: nothing to its left
        last_end = 0 if is_last_on_contig else save_end

    E.info(str(counter))