Beispiel #1
0
 def __repr__(self):
     """
     Renders this interval as a constructor-style string. The name is only appended when one is set.
     """
     fields = [self.chromosome, self.start, self.stop, convert_strand(self.strand)]
     template = "ChromosomeInterval('{}', {}, {}, '{}')"
     if self.name is not None:
         # named intervals carry one extra quoted field
         template = "ChromosomeInterval('{}', {}, {}, '{}', '{}')"
         fields.append(self.name)
     return template.format(*fields)
Beispiel #2
0
 def __init__(self, gene_pred_tokens):
     """
     Builds this transcript from the tokens of one genePred record. The genePred columns are stored as
     attributes, the exon start/end lists are rewritten as BED-style block sizes and start-relative block
     starts, and the resulting BED tokens are used to build the exon/intron intervals and Exon objects.
     """
     tokens = gene_pred_tokens
     # text columns
     self.name, self.chromosome = tokens[0], tokens[1]
     self.strand = convert_strand(tokens[2])
     # integer columns
     self.start, self.stop = int(tokens[3]), int(tokens[4])
     self.thick_start, self.thick_stop = int(tokens[5]), int(tokens[6])
     # BED fields that genePred records do not carry get fixed placeholder values
     self.score = 0
     self.rgb = "128,0,0"
     # columns that only exist in genePred
     self.id, self.name2 = tokens[10], tokens[11]
     self.cds_start_stat, self.cds_end_stat = tokens[12], tokens[13]
     self.exon_frames = [int(frame) for frame in tokens[14].split(",") if frame != ""]
     # genePred lists absolute exon starts/ends; BED wants sizes plus starts relative to the record start
     self.block_count = tokens[7]
     exon_starts = [int(pos) for pos in tokens[8].split(",") if pos != ""]
     exon_ends = [int(pos) for pos in tokens[9].split(",") if pos != ""]
     self.block_sizes = ",".join(str(end - begin) for end, begin in izip(exon_ends, exon_starts))
     self.block_starts = ",".join(str(begin - self.start) for begin in exon_starts)
     bed_tokens = [
         tokens[1], self.start, self.stop, self.name, self.score, tokens[2],
         self.thick_start, self.thick_stop, self.rgb, self.block_count,
         self.block_sizes, self.block_starts,
     ]
     # chromosome-space intervals for exons and introns
     self.exon_intervals = self._get_exon_intervals(bed_tokens)
     self.intron_intervals = self._get_intron_intervals()
     # Exon objects that map transcript coordinates onto the chromosome
     self.exons = self._get_exons(bed_tokens)
     # size bookkeeping
     self._get_cds_size()
     self._get_size()
Beispiel #3
0
 def __init__(self, chromosome, start, stop, strand, name=None):
     """
     Stores an interval on a chromosome.

     chromosome is stored as a str. start (0 based) and stop (exclusive) are coerced to int.
     strand is stored unchanged when it is True, False or None; any other value is first passed
     through convert_strand(). name is optional and stored as given.

     Raises AssertionError if start is greater than stop.
     """
     self.chromosome = str(chromosome)
     # coerce before comparing: string coordinates would otherwise be ordered lexicographically
     # (e.g. "9" <= "10" is False) and the check would accept or reject the wrong intervals
     self.start = int(start)    # 0 based
     self.stop = int(stop)      # exclusive
     assert self.start <= self.stop, (self.chromosome, self.start, self.stop)
     if strand not in [True, False, None]:
         strand = convert_strand(strand)
     self.strand = strand       # True/False/None as given, otherwise the convert_strand() result
     self.name = name
Beispiel #4
0
def chromosome_region_to_bed(t, start, stop, rgb, name):
    """
    Builds single-block BED12 tokens for an arbitrary start/stop region on the chromosome of transcript t.
    Unlike chromosome_coordinate_to_bed, the BED information of the transcript itself is not resized; only
    the chromosome, strand and name of t are used.
    """
    bed_strand = convert_strand(t.strand)
    chromosome = t.chromosome
    assert not (start is None or stop is None), (t.name, start, stop, name)
    assert stop >= start, (t.name, start, stop, name)
    bed_name = "/".join([name, t.name])
    block_size = stop - start
    # thick start/stop mirror the region itself; the single block starts at offset 0
    return [chromosome, start, stop, bed_name, 0, bed_strand, start, stop, rgb, 1, block_size, 0]
Beispiel #5
0
def splice_intron_interval_to_bed(t, intron_interval, rgb, name):
    """
    Turns an intron interval into two-block BED12 tokens that cover only its first two and last two
    bases (the splice sites).
    """
    chrom_start, chrom_stop = intron_interval.start, intron_interval.stop
    assert chrom_stop >= chrom_start, (t.name, t.chromosome)
    # offset of the second 2bp block, measured from the start of the intron
    second_block_offset = chrom_stop - chrom_start - 2
    assert second_block_offset > 2, (t.name, t.chromosome)
    block_starts = "0,{}".format(second_block_offset)
    bed_name = "/".join([name, t.name])
    bed_strand = convert_strand(intron_interval.strand)
    return [intron_interval.chromosome, chrom_start, chrom_stop, bed_name, 0,
            bed_strand, chrom_start, chrom_stop, rgb, 2, "2,2", block_starts]
Beispiel #6
0
    def _get_exon_intervals(self, bed_tokens):
        """
        Builds one ChromosomeInterval per BED block, in chromosome coordinate space.
        Blocks are taken in the order the BED record lists them, i.e. (+) strand ordering regardless of
        the transcript strand, so the exons of (-) strand genes come out backwards.
        """
        chrom = bed_tokens[0]
        # the stop column is parsed alongside the start but is not needed to place the blocks
        start, _ = int(bed_tokens[1]), int(bed_tokens[2])
        strand = convert_strand(bed_tokens[5])

        sizes = [int(size) for size in bed_tokens[10].split(",") if size != ""]
        offsets = [int(offset) for offset in bed_tokens[11].split(",") if offset != ""]

        return [ChromosomeInterval(chrom, start + offset, start + offset + size, strand)
                for size, offset in izip(sizes, offsets)]
Beispiel #7
0
 def __init__(self, bed_tokens):
     """
     Builds this transcript from the first twelve tokens of a BED record, then derives the exon/intron
     intervals, the Exon objects and the size attributes from them.
     """
     # text columns are stored as given
     self.chromosome, self.name, self.rgb = bed_tokens[0], bed_tokens[3], bed_tokens[8]
     # block columns are kept in their original (unparsed) form
     self.block_count, self.block_sizes, self.block_starts = bed_tokens[9], bed_tokens[10], bed_tokens[11]
     # numeric columns
     self.start, self.stop = int(bed_tokens[1]), int(bed_tokens[2])
     self.score = int(bed_tokens[4])
     self.thick_start, self.thick_stop = int(bed_tokens[6]), int(bed_tokens[7])
     self.strand = convert_strand(bed_tokens[5])
     # chromosome-space intervals for exons and introns
     self.exon_intervals = self._get_exon_intervals(bed_tokens)
     self.intron_intervals = self._get_intron_intervals()
     # Exon objects that map transcript coordinates onto the chromosome
     self.exons = self._get_exons(bed_tokens)
     # size bookkeeping
     self._get_cds_size()
     self._get_size()
Beispiel #8
0
 def get_bed(self, rgb, name):
     """
     Produces BED12 tokens for this interval as one block spanning it. Both a name and a rgb value
     must be supplied.
     """
     span = [self.start, self.stop]
     strand = convert_strand(self.strand)
     # the span is repeated for the thick start/stop columns; the single block has size len(self)
     return [self.chromosome] + span + [name, 0, strand] + span + [rgb, 1, len(self), 0]
Beispiel #9
0
def interval_to_bed(t, interval, rgb, name):
    """
    Converts an interval object into single-block BED12 tokens. t is the transcript object whose name is
    appended to the record name. Interval objects should always have start <= stop (+ strand chromosome
    ordering).
    """
    chrom_start, chrom_stop = interval.start, interval.stop
    assert chrom_stop >= chrom_start, (t.name, t.chromosome)
    bed_name = "/".join([name, t.name])
    bed_strand = convert_strand(interval.strand)
    block_size = chrom_stop - chrom_start
    return [interval.chromosome, chrom_start, chrom_stop, bed_name, 0, bed_strand,
            chrom_start, chrom_stop, rgb, 1, block_size, 0]
Beispiel #10
0
    def get_bed(self, rgb=None, name=None, start_offset=None, stop_offset=None):
        """
        Returns this transcript as a list of BED12 tokens with optional changes to rgb and name.

        rgb defaults to self.rgb. When name is given the record is named "<name>/<self.name>", otherwise
        self.name is used.

        If start_offset or stop_offset are set (chromosome coordinates), then this record will be changed to only
        show results within that region, which is defined in chromosome coordinates. Both offsets must lie
        within [self.start, self.stop] and start_offset must not exceed stop_offset (checked with asserts).
        Equal offsets produce a single zero-size block at that position.
        """
        # sanity checks: the requested window has to be ordered and inside this transcript
        if start_offset is not None and stop_offset is not None:
            assert start_offset <= stop_offset
        if start_offset is not None:
            assert start_offset >= self.start
        if stop_offset is not None:
            assert stop_offset <= self.stop
        if rgb is None:
            rgb = self.rgb
        if name is not None:
            name += "/" + self.name
        else:
            name = self.name
        if start_offset is None and stop_offset is None:
            # no window requested: emit the stored record untouched
            return [self.chromosome, self.start, self.stop, name, self.score, convert_strand(self.strand),
                    self.thick_start, self.thick_stop, rgb, self.block_count, self.block_sizes, self.block_starts]
        elif start_offset == stop_offset:
            # zero-width window: only allowed where the coordinate maps into the transcript
            assert self.chromosome_coordinate_to_transcript(start_offset) is not None   # no intron records
            return [self.chromosome, start_offset, stop_offset, name, self.score, convert_strand(self.strand),
                    start_offset, stop_offset, rgb, 1, 0, 0]

        def _move_start(exon_intervals, block_count, block_starts, block_sizes, start, start_offset):
            # Drops the leading blocks that end at or before start_offset, rebases the remaining block starts
            # on the new record start, then trims the first remaining block if start_offset falls inside it.
            to_remove = len([x for x in exon_intervals if x.start <= start_offset and x.stop <= start_offset])
            assert to_remove < len(exon_intervals)
            if to_remove > 0:
                block_count -= to_remove
                block_sizes = block_sizes[to_remove:]
                # the record now starts where the first surviving block starts
                start += block_starts[to_remove]
                # rebuild the block starts from the gaps between consecutive surviving blocks
                new_block_starts = [0]
                for i in xrange(to_remove, len(block_starts) - 1):
                    new_block_starts.append(block_starts[i + 1] - block_starts[i] + new_block_starts[-1])
                block_starts = new_block_starts
            if start_offset > start:
                # start_offset is inside the first block: shrink it and shift every later block start
                # (start - start_offset is negative here); these updates modify the lists in place
                block_sizes[0] += start - start_offset
                block_starts[1:] = [x + start - start_offset for x in block_starts[1:]]
                start = start_offset
            return start, block_count, block_starts, block_sizes

        def _move_stop(exon_intervals, block_count, block_starts, block_sizes, stop, start, stop_offset):
            # Drops the trailing blocks that start at or after stop_offset, recomputes the record stop, then
            # trims the last remaining block if stop_offset falls inside it.
            to_remove = len([x for x in exon_intervals if x.stop >= stop_offset and x.start >= stop_offset])
            assert to_remove < len(exon_intervals)
            if to_remove > 0:
                block_count -= to_remove
                block_sizes = block_sizes[:-to_remove]
                block_starts = block_starts[:-to_remove]
                assert len(block_sizes) == len(block_starts)
                if len(block_sizes) == 0:
                    # nothing left (the lists may already have been shortened by _move_start):
                    # fall back to a single zero-size block; both names refer to the same list
                    block_sizes = block_starts = [0]
                    block_count = 1
                stop = start + block_sizes[-1] + block_starts[-1]
            if start + block_starts[-1] < stop_offset < stop:
                # stop_offset is inside the last block: cut the block off at stop_offset
                block_sizes[-1] = stop_offset - start - block_starts[-1]
                stop = stop_offset
            return stop, block_count, block_starts, block_sizes

        # working copies of the stored BED fields, parsed to integers
        # NOTE(review): the helpers index and slice these, so they need map() to return lists as it does
        # on Python 2 (xrange above also points to Python 2) - confirm before running under Python 3
        block_count = int(self.block_count)
        block_starts = map(int, self.block_starts.split(","))
        block_sizes = map(int, self.block_sizes.split(","))
        start = self.start
        stop = self.stop
        thick_start = self.thick_start
        thick_stop = self.thick_stop

        # only move an end when the requested offset actually lies inside the current span
        if start_offset is not None and start_offset > start:
            start, block_count, block_starts, block_sizes = _move_start(self.exon_intervals, block_count, block_starts,
                                                                    block_sizes, start, start_offset)
        if stop_offset is not None and stop_offset < stop:
            stop, block_count, block_starts, block_sizes = _move_stop(self.exon_intervals, block_count, block_starts,
                                                                  block_sizes, stop, start, stop_offset)
        # clamp the thick region to the new span
        if start > thick_start:
            thick_start = start
        if stop < thick_stop:
            thick_stop = stop
        # the whole new span lies past thick_stop or before thick_start: zero out the thick region
        if (start > thick_stop and stop > thick_stop) or (start < thick_start and stop < thick_start):
            thick_start = 0
            thick_stop = 0
        # back to the comma separated string form used by BED
        block_starts = ",".join(map(str, block_starts))
        block_sizes = ",".join(map(str, block_sizes))
        return [self.chromosome, start, stop, name, self.score, convert_strand(self.strand), thick_start, thick_stop, rgb,
                block_count, block_sizes, block_starts]
Beispiel #11
0
 def get_interval(self):
     """
     Builds a ChromosomeInterval that covers this transcript from its start to its stop.
     """
     strand = convert_strand(self.strand)
     return ChromosomeInterval(self.chromosome, self.start, self.stop, strand)
Beispiel #12
0
    def _get_exons(self, bed_tokens):
        """
        Get a list of Exons representing the exons in transcript coordinate
        space. This is in transcript order. See the Exon class for more.

        bed_tokens are BED12 tokens (start, thick start/stop, strand, block count, block sizes and block
        starts are read from them). For (-) strand records the blocks are walked in reverse so that the
        first Exon is the first one in transcript order. Each Exon is built from its transcript-relative
        start/stop, the strand, its chromosome start/stop and three CDS values that stay None unless one
        of the branches below sets them: the transcript-relative CDS start, the transcript-relative CDS
        stop and the CDS position of the exon's first coding base.
        """
        exons = []
        chrom_start, chrom_stop = int(bed_tokens[1]), int(bed_tokens[2])
        thick_start, thick_stop = int(bed_tokens[6]), int(bed_tokens[7])
        # an empty thick region is normalised to 0/0, which the branches below treat as non-coding
        if thick_start == thick_stop:
            thick_start = thick_stop = 0
        chrom, strand = bed_tokens[0], convert_strand(bed_tokens[5])

        block_count = int(bed_tokens[9])
        block_sizes = [int(x) for x in bed_tokens[10].split(",") if x != ""]
        block_starts = [int(x) for x in bed_tokens[11].split(",") if x != ""]

        ##################################################################
        # HERE BE DRAGONS
        # this is seriously ugly code to maintain proper mapping
        # between coordinate spaces. See the unit tests.
        ##################################################################
        if strand is False:
            # walk the blocks from the last to the first
            block_sizes = reversed(block_sizes)
            block_starts = reversed(block_starts)

        # t_pos: running transcript position; cds_pos: running count of coding bases seen so far
        t_pos, cds_pos = 0, None
        for block_size, block_start in izip(block_sizes, block_starts):
            # calculate transcript relative coordinates
            this_start = t_pos
            this_stop = t_pos + block_size
            # calculate chromosome relative coordinates
            this_chrom_start = chrom_start + block_start
            this_chrom_stop = chrom_start + block_start + block_size
            # calculate transcript-relative CDS positions
            # cds_pos is pos of first coding base in CDS coordinates
            this_cds_start, this_cds_stop, this_cds_pos = None, None, None
            if strand is True:
                # special case - single exon
                # NOTE(review): this branch is tested before the non-coding one, so a single-exon record
                # with an empty thick region (reset to 0/0 above) still gets CDS values here - confirm
                # that this is what the Exon class expects
                if block_count == 1:
                    this_cds_pos = 0
                    this_cds_start = thick_start - this_chrom_start
                    this_cds_stop = thick_stop - this_chrom_start
                # special case - entirely non-coding
                elif thick_start == thick_stop == 0:
                    this_cds_start, this_cds_stop, this_cds_pos = None, None, None
                # special case - CDS starts and stops on the same exon
                elif (this_chrom_start <= thick_start < this_chrom_stop and this_chrom_start < thick_stop <=
                        this_chrom_stop):
                    this_cds_pos = 0
                    cds_pos = this_chrom_stop - thick_start
                    this_cds_start = this_start + thick_start - this_chrom_start
                    this_cds_stop = this_stop + thick_stop - this_chrom_stop
                # is this the start codon containing exon?
                elif this_chrom_start <= thick_start < this_chrom_stop:
                    cds_pos = this_chrom_stop - thick_start
                    this_cds_pos = 0
                    this_cds_start = this_start + thick_start - this_chrom_start
                # is this the stop codon containing exon?
                elif this_chrom_start < thick_stop <= this_chrom_stop:
                    this_cds_pos = cds_pos
                    cds_pos += thick_stop - this_chrom_start
                    this_cds_stop = this_stop + thick_stop - this_chrom_stop
                # is this exon all coding?
                elif (this_cds_stop is None and this_cds_start is None and thick_stop >=
                      this_chrom_stop and thick_start < this_chrom_start):
                    this_cds_pos = cds_pos
                    cds_pos += block_size
            else:
                # special case - single exon
                # NOTE(review): same ordering caveat as the single-exon branch for the (+) strand
                if block_count == 1:
                    this_cds_pos = 0
                    this_cds_start = this_chrom_stop - thick_stop
                    # the CDS spans thick_stop - thick_start bases, so it ends at this_chrom_stop - thick_start,
                    # the same value the multi-exon same-exon branch below computes; measuring from
                    # this_chrom_start instead would always give the full exon length
                    this_cds_stop = thick_stop - thick_start + this_cds_start
                # special case - entirely non-coding
                elif thick_start == thick_stop == 0:
                    this_cds_start, this_cds_stop, this_cds_pos = None, None, None
                # special case - start and stop codons are on the same exon
                elif (this_chrom_start < thick_stop <= this_chrom_stop and this_chrom_start <= thick_start <
                      this_chrom_stop):
                    cds_pos = thick_stop - this_chrom_start
                    this_cds_pos = 0
                    this_cds_start = this_start + this_chrom_stop - thick_stop
                    this_cds_stop = this_start + this_chrom_stop - thick_start
                # is this the start codon containing exon?
                elif this_chrom_start < thick_stop <= this_chrom_stop:
                    cds_pos = thick_stop - this_chrom_start
                    this_cds_pos = 0
                    this_cds_start = this_start + this_chrom_stop - thick_stop
                # is this the stop codon containing exon?
                elif this_chrom_start <= thick_start < this_chrom_stop:
                    this_cds_pos = cds_pos
                    this_cds_stop = this_start + this_chrom_stop - thick_start
                # is this exon all coding?
                elif (this_cds_stop is None and this_cds_start is None and thick_stop >=
                      this_chrom_stop and thick_start < this_chrom_start):
                    this_cds_pos = cds_pos
                    cds_pos += block_size
            exons.append(Exon(this_start, this_stop, strand, this_chrom_start, this_chrom_stop, this_cds_start,
                              this_cds_stop, this_cds_pos))
            t_pos += block_size
        return exons