Beispiel #1
0
 def _adjust_cds_start(self, cds_interval, expected_frame, frame):
     """
     Trim cds_interval until its frame matches expected_frame.

     The frame is stepped with self._frame_incr until it equals
     expected_frame; the number of steps taken is the number of bases
     trimmed. Bases are removed from the start of a '+' strand interval and
     from the stop of an interval on any other strand.

     :param cds_interval: ChromosomeInterval to trim
     :param expected_frame: frame value to reach
     :param frame: current frame value
     :return: tuple of (trimmed interval, interval tagged 'gap' covering the
              trimmed bases). The trimmed interval is clamped and may be
              zero-length; the gap interval is not clamped.
     """
     # Count the steps explicitly so this only depends on _frame_incr
     shift = 0
     while frame != expected_frame:
         frame = self._frame_incr(frame)
         shift += 1

     if cds_interval.strand == '+':
         gap_start = cds_interval.start
         gap_stop = cds_interval.start + shift
         # never move the start past the stop
         new_start = min(gap_stop, cds_interval.stop)
         new_stop = cds_interval.stop
     else:
         gap_start = cds_interval.stop - shift
         gap_stop = cds_interval.stop
         new_start = cds_interval.start
         # never move the stop before the start
         new_stop = max(gap_start, cds_interval.start)

     trimmed = ChromosomeInterval(cds_interval.chromosome, new_start,
                                  new_stop, cds_interval.strand)
     gap = ChromosomeInterval(trimmed.chromosome, gap_start, gap_stop,
                              trimmed.strand, 'gap')
     return trimmed, gap
Beispiel #2
0
def construct_start_stop_intervals(intron_intervals, d):
    """
    Build a window of width 2 * d around each end of every interval.

    :param intron_intervals: iterable of intervals exposing chromosome, start, stop and strand
    :param d: number of bases to extend on either side of each boundary
    :return: two parallel tuples: (windows around the starts, windows around the stops)
    """
    # A single pass keeps this usable with one-shot iterables
    window_pairs = [
        (ChromosomeInterval(iv.chromosome, iv.start - d, iv.start + d,
                            iv.strand),
         ChromosomeInterval(iv.chromosome, iv.stop - d, iv.stop + d,
                            iv.strand))
        for iv in intron_intervals
    ]
    start_windows = tuple(pair[0] for pair in window_pairs)
    stop_windows = tuple(pair[1] for pair in window_pairs)
    return start_windows, stop_windows
 def chromosome_coordinate_to_mrna(self, coord):
     """
     Convert a chromosome coordinate into an offset along the exons of this
     transcript, counted in transcript orientation.

     :param coord: chromosome coordinate
     :return: 0-based offset, or None if coord lies outside
              [self.start, self.stop) or does not overlap any exon
     """
     if coord < self.start or coord >= self.stop:
         return None
     point = ChromosomeInterval(self.chromosome, coord, coord + 1, self.strand)
     forward = self.strand == '+'
     ordered_exons = self.exon_intervals if forward else reversed(self.exon_intervals)
     upstream = 0  # exonic bases walked past before reaching coord
     for exon in ordered_exons:
         if point.overlap(exon):
             within = coord - exon.start if forward else exon.stop - coord - 1
             return upstream + within
         upstream += len(exon)
     # no exon contains coord
     return None
Beispiel #4
0
 def chromosome_coordinate_to_mrna(self, coord):
     """
     Map a chromosome coordinate onto this transcript's exons.

     Exons are walked in transcript orientation (reversed for non-'+'
     strands) and the lengths of the exons before the one holding coord are
     summed.

     :param coord: chromosome coordinate
     :return: 0-based offset, or None when coord is outside
              [self.start, self.stop) or overlaps no exon
     """
     if not (self.start <= coord < self.stop):
         return None
     probe = ChromosomeInterval(self.chromosome, coord, coord + 1, self.strand)
     consumed = 0
     if self.strand == '+':
         for exon in self.exon_intervals:
             if probe.overlap(exon):
                 return consumed + (coord - exon.start)
             consumed += len(exon)
     else:
         for exon in reversed(self.exon_intervals):
             if probe.overlap(exon):
                 return consumed + (exon.stop - coord - 1)
             consumed += len(exon)
     # coord falls between exons
     return None
Beispiel #5
0
 def _get_intron_intervals(self):
     """
     Derive the introns of this transcript from its block layout.

     Each intron runs from the end of one block (block start + block size)
     to the start of the next block, both shifted by self.start.

     :return: List of ChromosomeIntervals, one per adjacent pair of blocks
     """
     return [
         ChromosomeInterval(
             self.chromosome,
             self.start + self.block_starts[n - 1] + self.block_sizes[n - 1],
             self.start + self.block_starts[n],
             self.strand)
         for n in xrange(1, len(self.block_starts))
     ]
Beispiel #6
0
 def _get_exon_intervals(self):
     """
     Derive the exons of this transcript from its block layout.

     Each block becomes one interval starting at self.start + block start
     and extending for block size bases.

     :return: List of ChromosomeIntervals, one per block
     """
     return [
         ChromosomeInterval(self.chromosome,
                            self.start + rel_start,
                            self.start + rel_start + size,
                            self.strand)
         for size, rel_start in izip(self.block_sizes, self.block_starts)
     ]
Beispiel #7
0
 def _get_exon_intervals(self):
     """
     Derive the exons of this transcript from its block layout, attaching
     the matching entry of self.exon_frames to each interval.

     :return: List of ChromosomeIntervals whose data is {'frame': <frame>}
     """
     blocks = izip(self.block_sizes, self.block_starts, self.exon_frames)
     framed_exons = []
     for size, rel_start, exon_frame in blocks:
         begin = self.start + rel_start
         framed_exons.append(
             ChromosomeInterval(self.chromosome, begin, begin + size,
                                self.strand, data={'frame': exon_frame}))
     return framed_exons
Beispiel #8
0
 def get_start_intervals(self):
     """
     Returns the ChromosomeInterval(s) covering the first three CDS bases of
     this transcript. More than one interval is returned when those bases
     are not contiguous on the chromosome (e.g. split by a splice junction).

     Each interval's data is set to convert_frame() of the number of bases
     covered by the intervals before it in the returned list.
     """
     assert self.cds_size >= 3
     codon_positions = [self.cds_coordinate_to_chromosome(offset)
                        for offset in range(3)]
     codon_positions.sort()
     runs = list(find_intervals(codon_positions))
     start_intervals = []
     for run in runs:
         start_intervals.append(
             ChromosomeInterval(self.chromosome, run[0], run[-1] + 1,
                                self.strand))
     assert sum(len(interval) for interval in start_intervals) == 3
     covered = 0
     for interval in start_intervals:
         interval.data = convert_frame(covered)
         covered += len(interval)
     return start_intervals
Beispiel #9
0
 def _get_coding_interval(self):
     """
     Build the interval running from thick_start to thick_stop on this
     transcript's chromosome and strand.

     :return: ChromosomeInterval spanning the coding region
     """
     coding_span = ChromosomeInterval(
         self.chromosome, self.thick_start, self.thick_stop, self.strand)
     return coding_span
Beispiel #10
0
    def get_gene_pred(self,
                      name=None,
                      new_start=None,
                      new_stop=None,
                      name2=None,
                      score=None):
        """
        Returns this transcript as a genePred transcript.
        If new_start or new_stop are set (chromosome coordinates), then this record will be changed to only
        show results within that region, which is defined in chromosome coordinates. The frames field will be properly
        adjusted, and the cds_start_stat/cds_end_stat fields will change to 'unk' if they are moved

        TODO: If this is a transMap transcript, and there were coding indels, the frame information will change to
        reflect the new arrangement and the implicit indel information will be lost.

        :param name: Set this to override the name field.
        :param new_start: Set this (in chromosome coordinates) to move the start. Must be >= self.start.
        :param new_stop: Set this (in chromosome coordinates) to move the stop. Must be <= self.stop.
        :param name2: Set this to override the name2 field.
        :param score: Set this to override the score field.
        :return: The 15 genePred fields, each converted with str().
        """
        name = self.name if name is None else name
        name2 = self.name2 if name2 is None else name2
        score = self.score if score is None else score

        # if no resizing, just return what we have
        if new_start is None and new_stop is None:
            exon_starts = ','.join(
                map(str, [exon.start for exon in self.exon_intervals]))
            exon_ends = ','.join(
                map(str, [exon.stop for exon in self.exon_intervals]))
            exon_frames = ','.join(map(str, self.exon_frames))
            return map(str, [
                name, self.chromosome, self.strand, self.start, self.stop,
                self.thick_start, self.thick_stop,
                len(self.exon_intervals), exon_starts, exon_ends, score, name2,
                self.cds_start_stat, self.cds_end_stat, exon_frames
            ])
        if new_start is not None and new_stop is not None:
            assert new_start <= new_stop
        if new_start is not None:
            assert new_start >= self.start
        else:
            new_start = self.start
        if new_stop is not None:
            assert new_stop <= self.stop
        else:
            new_stop = self.stop

        # start slicing out intervals, adjusting the frames
        new_interval = ChromosomeInterval(self.chromosome, new_start, new_stop,
                                          self.strand)
        exon_intervals = []
        exon_frames = []
        # walk exons and frames in transcript orientation
        exon_iter = self.exon_intervals if self.strand == '+' else self.exon_intervals[::-1]
        frame_iter = self.exon_frames if self.strand == '+' else reversed(
            self.exon_frames)

        # attempt to find the first frame. If there is none, then we have a non-coding transcript and this is easy
        try:
            starting_frame = [f for f in frame_iter if f != -1][0]
        except IndexError:  # non-coding transcript
            # Exons entirely outside the new interval intersect to None; drop
            # them here just as the coding branch below skips them, otherwise
            # the None entries break the start/stop lookups further down.
            exon_intervals = [
                new_exon
                for new_exon in (exon.intersection(new_interval)
                                 for exon in exon_iter)
                if new_exon is not None
            ]
            exon_frames = [-1] * len(exon_intervals)
        else:  # start following frame to adjust for resized transcript
            cds_counter = 0  # keep track of total CDS bases encountered
            cds_flag = False  # becomes True once the first coding exon is seen
            for exon in exon_iter:
                new_exon = exon.intersection(new_interval)
                if new_exon is None:
                    continue
                exon_intervals.append(new_exon)
                coding_exon = exon.intersection(self.coding_interval)
                if coding_exon is None:
                    exon_frames.append(-1)
                elif cds_flag is False:
                    cds_flag = True
                    exon_frames.append(starting_frame)
                    # seed the counter with the starting frame so later
                    # exons get (bases so far + starting frame) % 3
                    cds_counter += len(coding_exon) + starting_frame
                else:
                    exon_frames.append(cds_counter % 3)
                    cds_counter += len(coding_exon)

        # flip back around negative strand transcripts
        if self.strand == '-':
            exon_intervals = exon_intervals[::-1]
            exon_frames = exon_frames[::-1]

        # if new_start or new_stop were intronic coordinates, fix this
        if new_start != exon_intervals[0].start:
            new_start = exon_intervals[0].start
        if new_stop != exon_intervals[-1].stop:
            new_stop = exon_intervals[-1].stop

        # clip the coding region to the new bounds; a moved boundary has unknown status
        thick_start = max(self.thick_start, new_start)
        thick_stop = min(self.thick_stop, new_stop)
        cds_start_stat = 'unk' if thick_start != self.thick_start else self.cds_start_stat
        cds_end_stat = 'unk' if thick_stop != self.thick_stop else self.cds_end_stat
        exon_count = len(exon_intervals)
        exon_starts = ','.join(
            map(str, [exon.start for exon in exon_intervals]))
        exon_ends = ','.join(map(str, [exon.stop for exon in exon_intervals]))
        exon_frames = ','.join(map(str, exon_frames))
        return map(str, [
            name, self.chromosome, self.strand, new_start, new_stop,
            thick_start, thick_stop, exon_count, exon_starts, exon_ends, score,
            name2, cds_start_stat, cds_end_stat, exon_frames
        ])
Beispiel #11
0
    def get_bed(self, rgb=None, name=None, new_start=None, new_stop=None):
        """
        Produce the BED fields for this transcript, optionally restricted to a sub region.

        :param rgb: Overrides the RGB field when given.
        :param name: Overrides the name field when given.
        :param new_start: New start in chromosome coordinates; must be >= self.start.
        :param new_stop: New stop in chromosome coordinates; must be <= self.stop.
        :return: The 12 BED fields, each converted with str().
        """
        if new_start is not None and new_stop is not None:
            assert new_start <= new_stop
        if new_start is None:
            new_start = self.start
        else:
            assert new_start >= self.start
        if new_stop is None:
            new_stop = self.stop
        else:
            assert new_stop <= self.stop
        if rgb is None:
            rgb = self.rgb
        if name is None:
            name = self.name

        # zero-length request: emit a single empty block
        if new_start == new_stop:
            if self.cds_size == 0:
                thick_start, thick_stop = 0, 0
            else:
                thick_start, thick_stop = new_start, new_stop
            empty_fields = [
                self.chromosome, new_start, new_stop, name, self.score,
                self.strand, thick_start, thick_stop, rgb, 1, 0, 0
            ]
            return map(str, empty_fields)

        # a boundary that does not map onto the mRNA is moved to the closest exon boundary
        if self.chromosome_coordinate_to_mrna(new_start) is None:
            exon_begins = [x.start for x in self.exon_intervals]
            new_start = find_closest(exon_begins, new_start)
        if self.chromosome_coordinate_to_mrna(new_stop) is None:
            exon_ends = [x.stop for x in self.exon_intervals]
            new_stop = find_closest(exon_ends, new_stop)

        # clip every exon to the requested window, dropping exons outside of it
        window = ChromosomeInterval(self.chromosome, new_start, new_stop,
                                    self.strand)
        clipped_exons = []
        for exon in self.exon_intervals:
            piece = exon.intersection(window)
            if piece is not None:
                clipped_exons.append(piece)

        # make the reported bounds agree with the first and last retained exon
        first_exon, last_exon = clipped_exons[0], clipped_exons[-1]
        if new_start != first_exon.start:
            new_start = first_exon.start
        if new_stop != last_exon.stop:
            new_stop = last_exon.stop

        thick_start = max(self.thick_start, new_start)
        thick_stop = min(self.thick_stop, new_stop)
        # window misses the thick region: report thick_start = thick_stop = 0
        if thick_start >= self.thick_stop or thick_stop < self.thick_start:
            thick_start = thick_stop = 0

        block_sizes = ','.join(str(len(x)) for x in clipped_exons)
        block_starts = ','.join(str(x.start - new_start) for x in clipped_exons)
        bed_fields = [
            self.chromosome, new_start, new_stop, name, self.score,
            self.strand, thick_start, thick_stop, rgb, len(clipped_exons),
            block_sizes, block_starts
        ]
        return map(str, bed_fields)