def _adjust_cds_start(self, cds_interval, expected_frame, frame):
    """
    Trim the 5' end of cds_interval until its frame matches expected_frame.

    The number of bases to trim is found by repeatedly applying
    self._frame_incr to frame until it equals expected_frame. On the '+'
    strand the bases are removed from the start of the interval; on any other
    strand they are removed from the stop.

    :param cds_interval: ChromosomeInterval for the CDS block to adjust.
    :param expected_frame: Frame value the block should end up with.
    :param frame: Frame value the block currently has.
    :return: Tuple (trimmed interval, gap interval). The trimmed interval may
        be zero-length. The gap interval is built with 'gap' as its fifth
        positional argument.
    """
    # Counting in a loop rather than computing the offset arithmetically keeps
    # this tied to whatever stepping rule _frame_incr implements.
    shift = 0
    while frame != expected_frame:
        frame = self._frame_incr(frame)
        shift += 1

    lower = cds_interval.start
    upper = cds_interval.stop
    if cds_interval.strand == '+':
        # clamp so the trimmed block never ends up with start > stop
        trimmed_start, trimmed_stop = min(lower + shift, upper), upper
        gap_start, gap_stop = lower, lower + shift
    else:
        trimmed_start, trimmed_stop = lower, max(upper - shift, lower)
        gap_start, gap_stop = upper - shift, upper
    # NOTE(review): unlike the trimmed block, the gap bounds are not clamped,
    # so the gap can extend past a block shorter than the shift -- confirm
    # that this is intended.

    cds_interval = ChromosomeInterval(cds_interval.chromosome, trimmed_start,
                                      trimmed_stop, cds_interval.strand)
    gap_interval = ChromosomeInterval(cds_interval.chromosome, gap_start,
                                      gap_stop, cds_interval.strand, 'gap')
    return cds_interval, gap_interval
def construct_start_stop_intervals(intron_intervals, d):
    """
    Build windows of width 2*d centred on each end of every given interval.

    :param intron_intervals: Iterable of intervals exposing chromosome, start,
        stop and strand attributes.
    :param d: Number of bases to extend on either side of each boundary.
    :return: Two parallel tuples of ChromosomeInterval objects: the windows
        around each interval's start, and the windows around each interval's
        stop, both in input order.
    """
    def _window(interval, centre):
        # 2d-wide window around a single boundary coordinate
        return ChromosomeInterval(interval.chromosome, centre - d, centre + d,
                                  interval.strand)

    # Single pass over the input so that one-shot iterables are supported.
    windows = [(_window(intron, intron.start), _window(intron, intron.stop))
               for intron in intron_intervals]
    start_windows = tuple(pair[0] for pair in windows)
    stop_windows = tuple(pair[1] for pair in windows)
    return start_windows, stop_windows
def chromosome_coordinate_to_mrna(self, coord):
    """
    Convert a chromosome coordinate into a transcript (mRNA) coordinate.

    :param coord: Chromosome position to convert.
    :return: Offset of coord within the concatenated exons, counted in
        transcript orientation, or None if coord lies outside
        [self.start, self.stop) or does not overlap any exon.
    """
    if coord < self.start or coord >= self.stop:
        return None

    # one-base interval used to probe the exons for overlap
    probe = ChromosomeInterval(self.chromosome, coord, coord + 1, self.strand)
    if not any(probe.overlap(exon) for exon in self.exon_intervals):
        return None

    forward = self.strand == '+'
    # walk the exons in transcript order
    ordered_exons = self.exon_intervals if forward else reversed(self.exon_intervals)
    offset = 0
    for exon in ordered_exons:
        if probe.overlap(exon):
            if forward:
                offset += coord - exon.start
            else:
                offset += exon.stop - coord - 1
            break
        # exon lies entirely upstream of coord: count all of its bases
        offset += len(exon)
    return offset
def chromosome_coordinate_to_mrna(self, coord):
    """
    Map a chromosome position onto this transcript's spliced coordinates.

    :param coord: Chromosome position to map.
    :return: Number of exonic bases that precede coord in transcript
        orientation, or None when coord is outside [self.start, self.stop)
        or falls in no exon.
    """
    outside_span = coord < self.start or coord >= self.stop
    if outside_span:
        return None

    # single-base interval at coord, used for the overlap tests below
    point = ChromosomeInterval(self.chromosome, coord, coord + 1, self.strand)
    if not any(point.overlap(exon) for exon in self.exon_intervals):
        return None

    if self.strand == '+':
        exons_in_order = self.exon_intervals
    else:
        # non-'+' strand transcripts are read from the last exon backwards
        exons_in_order = reversed(self.exon_intervals)

    position = 0
    for exon in exons_in_order:
        if not point.overlap(exon):
            # whole exon precedes coord in transcript order
            position += len(exon)
            continue
        if self.strand == '+':
            position += coord - exon.start
        else:
            position += exon.stop - coord - 1
        break
    return position
def _get_intron_intervals(self):
    """
    Build the intron intervals of this transcript.

    Each intron spans from the end of one block to the start of the next;
    block starts are relative to self.start.

    :return: List of ChromosomeIntervals, one per pair of adjacent blocks
    """
    introns = []
    for upstream in xrange(len(self.block_starts) - 1):
        downstream = upstream + 1
        intron_stop = self.start + self.block_starts[downstream]
        intron_start = (self.start + self.block_starts[upstream] +
                        self.block_sizes[upstream])
        introns.append(ChromosomeInterval(self.chromosome, intron_start,
                                          intron_stop, self.strand))
    return introns
def _get_exon_intervals(self):
    """
    Build the exon intervals of this transcript.

    Block starts are offsets relative to self.start; each exon runs for the
    matching block size from there.

    :return: List of ChromosomeIntervals, one per (block_size, block_start) pair
    """
    return [
        ChromosomeInterval(self.chromosome,
                           self.start + offset,
                           self.start + offset + size,
                           self.strand)
        for size, offset in izip(self.block_sizes, self.block_starts)
    ]
def _get_exon_intervals(self):
    """
    Build the exon intervals of this transcript with frame information.

    Same construction as the plain exon intervals, except that each interval
    carries data={'frame': frame} taken from the matching entry of
    self.exon_frames.

    :return: List of ChromosomeIntervals
    """
    per_exon = izip(self.block_sizes, self.block_starts, self.exon_frames)
    return [
        ChromosomeInterval(self.chromosome,
                           self.start + offset,
                           self.start + offset + size,
                           self.strand,
                           data={'frame': exon_frame})
        for size, offset, exon_frame in per_exon
    ]
def get_start_intervals(self):
    """
    Returns one or more ChromosomeInterval objects covering the first three
    CDS bases of this transcript. More than one may exist if the codon is
    split over a splice junction.

    Each returned interval has its data attribute set to
    convert_frame(number of codon bases in the intervals listed before it).

    :return: List of ChromosomeIntervals whose lengths sum to 3
    """
    assert self.cds_size >= 3
    codon_positions = [self.cds_coordinate_to_chromosome(offset)
                       for offset in range(3)]
    codon_positions.sort()

    # group the sorted chromosome positions into runs; each run becomes a
    # half-open interval from its first position to its last position + 1
    runs = list(find_intervals(codon_positions))
    intervals = []
    for run in runs:
        intervals.append(ChromosomeInterval(self.chromosome, run[0],
                                            run[-1] + 1, self.strand))
    assert sum(len(interval) for interval in intervals) == 3

    bases_seen = 0
    for interval in intervals:
        interval.data = convert_frame(bases_seen)
        bases_seen += len(interval)
    return intervals
def _get_coding_interval(self):
    """
    Build the interval spanning this transcript's coding region.

    :return: ChromosomeInterval from self.thick_start to self.thick_stop on
        this transcript's chromosome and strand
    """
    coding_start = self.thick_start
    coding_stop = self.thick_stop
    return ChromosomeInterval(self.chromosome, coding_start, coding_stop,
                              self.strand)
def get_gene_pred(self, name=None, new_start=None, new_stop=None, name2=None, score=None):
    """
    Returns this transcript as a genePred transcript.

    If new_start or new_stop are set (chromosome coordinates), then this
    record will be changed to only show results within that region, which is
    defined in chromosome coordinates. Exons that do not overlap the region
    are dropped, and cds_start_stat/cds_end_stat change to 'unk' if the
    corresponding thick coordinate moves.

    TODO: If this is a transMap transcript, and there were coding indels, the
    frame information will change to reflect the new arrangement and the
    implicit indel information will be lost.

    NOTE(review): when resizing, frames are recomputed only from exons that
    survive the clip, using each surviving exon's full coding overlap; the
    first surviving coding exon is always given the transcript's first
    non -1 frame. Confirm this is correct when leading coding exons are
    removed or partially trimmed.

    :param name: Overrides self.name in the output if not None.
    :param new_start: New start in chromosome coordinates; must be
        >= self.start.
    :param new_stop: New stop in chromosome coordinates; must be <= self.stop.
    :param name2: Overrides self.name2 in the output if not None.
    :param score: Overrides self.score in the output if not None.
    :return: List of 15 strings: name, chromosome, strand, start, stop,
        thick start, thick stop, exon count, exon starts, exon ends, score,
        name2, cds_start_stat, cds_end_stat, exon frames.
    :raises AssertionError: if the requested bounds are inverted or lie
        outside the transcript.
    :raises IndexError: if no exon overlaps the requested region.
    """
    name = self.name if name is None else name
    name2 = self.name2 if name2 is None else name2
    score = self.score if score is None else score

    # if no resizing, just return what we have
    if new_start is None and new_stop is None:
        exon_starts = ','.join(str(exon.start) for exon in self.exon_intervals)
        exon_ends = ','.join(str(exon.stop) for exon in self.exon_intervals)
        exon_frames = ','.join(str(frame) for frame in self.exon_frames)
        fields = [
            name, self.chromosome, self.strand, self.start, self.stop,
            self.thick_start, self.thick_stop, len(self.exon_intervals),
            exon_starts, exon_ends, score, name2, self.cds_start_stat,
            self.cds_end_stat, exon_frames
        ]
        return [str(field) for field in fields]

    if new_start is not None and new_stop is not None:
        assert new_start <= new_stop
    if new_start is not None:
        assert new_start >= self.start
    else:
        new_start = self.start
    if new_stop is not None:
        assert new_stop <= self.stop
    else:
        new_stop = self.stop

    # start slicing out intervals, adjusting the frames
    new_interval = ChromosomeInterval(self.chromosome, new_start, new_stop,
                                      self.strand)
    exon_intervals = []
    exon_frames = []
    # walk exons and frames in transcript (5' to 3') order
    if self.strand == '+':
        exon_iter = self.exon_intervals
        frame_iter = self.exon_frames
    else:
        exon_iter = self.exon_intervals[::-1]
        frame_iter = reversed(self.exon_frames)

    # attempt to find the first frame. If there is none, then we have a
    # non-coding transcript and this is easy
    starting_frame = next((f for f in frame_iter if f != -1), None)
    if starting_frame is None:
        # non-coding transcript. Exons outside the new region intersect to
        # None and must be dropped, exactly as in the coding branch below;
        # keeping them would break the coordinate lookups further down.
        for exon in exon_iter:
            new_exon = exon.intersection(new_interval)
            if new_exon is not None:
                exon_intervals.append(new_exon)
        exon_frames = [-1] * len(exon_intervals)
    else:
        # start following frame to adjust for resized transcript
        cds_counter = 0  # keep track of total CDS bases encountered
        cds_flag = False  # becomes True once the first coding exon is seen
        for exon in exon_iter:
            new_exon = exon.intersection(new_interval)
            if new_exon is None:
                continue
            exon_intervals.append(new_exon)
            coding_exon = exon.intersection(self.coding_interval)
            if coding_exon is None:
                exon_frames.append(-1)
            elif cds_flag is False:
                cds_flag = True
                exon_frames.append(starting_frame)
                cds_counter += len(coding_exon) + starting_frame
            else:
                exon_frames.append(cds_counter % 3)
                cds_counter += len(coding_exon)

    # flip back around negative strand transcripts
    if self.strand == '-':
        exon_intervals = exon_intervals[::-1]
        exon_frames = exon_frames[::-1]

    # if new_start or new_stop were intronic coordinates, snap them to the
    # outermost retained exon boundaries
    new_start = exon_intervals[0].start
    new_stop = exon_intervals[-1].stop

    thick_start = max(self.thick_start, new_start)
    thick_stop = min(self.thick_stop, new_stop)
    cds_start_stat = 'unk' if thick_start != self.thick_start else self.cds_start_stat
    cds_end_stat = 'unk' if thick_stop != self.thick_stop else self.cds_end_stat
    exon_count = len(exon_intervals)
    exon_starts = ','.join(str(exon.start) for exon in exon_intervals)
    exon_ends = ','.join(str(exon.stop) for exon in exon_intervals)
    exon_frames = ','.join(str(frame) for frame in exon_frames)
    fields = [
        name, self.chromosome, self.strand, new_start, new_stop, thick_start,
        thick_stop, exon_count, exon_starts, exon_ends, score, name2,
        cds_start_stat, cds_end_stat, exon_frames
    ]
    return [str(field) for field in fields]
def get_bed(self, rgb=None, name=None, new_start=None, new_stop=None):
    """
    Returns BED tokens for this object. Can be sliced into sub regions.

    If the requested region is empty (new_start == new_stop) a single-block
    record is produced directly. Otherwise any bound that does not map to an
    exonic position is moved to the closest exon start (for new_start) or
    exon stop (for new_stop) before the exons are clipped to the region.

    :param rgb: Set this to modify the RGB field.
    :param name: Set this to modify the name field.
    :param new_start: Set this (in chromosome coordinates) to move the start.
    :param new_stop: Set this (in chromosome coordinates) to move the stop.
    :return: The 12 BED fields, each converted with str.
    """
    if new_start is not None and new_stop is not None:
        assert new_start <= new_stop
    if new_start is None:
        new_start = self.start
    else:
        assert new_start >= self.start
    if new_stop is None:
        new_stop = self.stop
    else:
        assert new_stop <= self.stop
    if rgb is None:
        rgb = self.rgb
    if name is None:
        name = self.name

    # special case -- start == stop
    if new_start == new_stop:
        if self.cds_size == 0:
            thick_start, thick_stop = 0, 0
        else:
            thick_start, thick_stop = new_start, new_stop
        empty_record = [
            self.chromosome, new_start, new_stop, name, self.score,
            self.strand, thick_start, thick_stop, rgb, 1, 0, 0
        ]
        return map(str, empty_record)

    # snap bounds that do not map to an exonic position onto exon edges
    if self.chromosome_coordinate_to_mrna(new_start) is None:
        exon_starts = [exon.start for exon in self.exon_intervals]
        new_start = find_closest(exon_starts, new_start)
    if self.chromosome_coordinate_to_mrna(new_stop) is None:
        exon_stops = [exon.stop for exon in self.exon_intervals]
        new_stop = find_closest(exon_stops, new_stop)

    # start slicing out intervals; exons outside the region intersect to None
    region = ChromosomeInterval(self.chromosome, new_start, new_stop,
                                self.strand)
    clipped = [exon.intersection(region) for exon in self.exon_intervals]
    exon_intervals = [piece for piece in clipped if piece is not None]

    # if new_start or new_stop were not within the exonic intervals, adjust
    # them to the outermost retained exon boundaries
    new_start = exon_intervals[0].start
    new_stop = exon_intervals[-1].stop

    thick_start = max(self.thick_start, new_start)
    thick_stop = min(self.thick_stop, new_stop)
    # zero out the thick fields when the clipped thick range falls at or past
    # the original thick stop, or ends before the original thick start
    if thick_start >= self.thick_stop or thick_stop < self.thick_start:
        thick_start = thick_stop = 0

    block_count = len(exon_intervals)
    block_sizes = ','.join(str(len(piece)) for piece in exon_intervals)
    block_starts = ','.join(str(piece.start - new_start)
                            for piece in exon_intervals)
    record = [
        self.chromosome, new_start, new_stop, name, self.score, self.strand,
        thick_start, thick_stop, rgb, block_count, block_sizes, block_starts
    ]
    return map(str, record)