예제 #1
0
 def setUp(self):
     """Build the fixture tree out of ``IntervalNode`` objects.

     The tree starts from a node holding [50, 59]; an interval
     [start, start + 9] is then inserted for every other multiple of ten
     from 0 to 100. Each ``insert`` call's return value replaces the
     current tree handle.
     """
     root = IntervalNode(50, 59, Interval(50, 59))
     for start in range(0, 110, 10):
         # [50, 59] is already the initial node; inserting it again
         # would duplicate it.
         if start != 50:
             feature = Interval(start, start + 9)
             root = root.insert(feature.start, feature.end, feature)
     self.intervals = root
예제 #2
0
 def setUp(self):
     """Build the fixture as an ``IntervalTree`` of ten-wide intervals.

     [50, 59] is added first, followed by [start, start + 9] for every
     other multiple of ten from 0 to 100, in ascending order.
     """
     tree = IntervalTree()
     # Keep the insertion order: 50 goes in first, the rest ascending.
     ordered_starts = [50] + [s for s in range(0, 110, 10) if s != 50]
     for start in ordered_starts:
         tree.add_interval(Interval(start, start + 9))
     self.intervals = tree
예제 #3
0
    def test_left(self):
        """Check ``left`` queries against the fixture stored in ``self.intervals``."""
        tree = self.intervals

        # The two results of ``left(60, n=2)`` must print as [50, 59]
        # followed by [40, 49]; the comparison is done on the string forms.
        found = str(tree.left(60, n=2))
        wanted = str([Interval(50, 59), Interval(40, 49)])
        self.assertEqual(found, wanted)

        # Querying at each multiple of ten must return an interval that
        # ends exactly one position before the query point.
        for boundary in range(10, 100, 10):
            hits = tree.left(boundary, max_dist=10, n=1)
            self.assertEqual(hits[0].end, boundary - 1)
예제 #4
0
    def setUp(self):
        """Build a large ``IntervalNode`` fixture.

        Starting from a node for [1, 2], a single-point interval
        [pos, pos] is inserted for every tenth position below
        ``self.max``, followed by 600 separate copies of [0, 1].
        """
        root = IntervalNode(1, 2, Interval(1, 2))
        self.max = 1000000
        for pos in range(0, self.max, 10):
            point = Interval(pos, pos)
            root = root.insert(point.start, point.end, point)

        # 600 distinct Interval objects, all with the same coordinates.
        for _ in range(600):
            root = root.insert(0, 1, Interval(0, 1))
        self.intervals = root
예제 #5
0
    def test_downstream(self):
        """Check ``downstream_of_interval`` with and without ``strand=-1``.

        For a query built without a strand argument every hit must start
        after position 60; for the same coordinates with ``strand=-1``
        every hit must start before position 59.
        """
        tree = self.intervals

        # assertGreater/assertLess report both operands on failure, which
        # a bare assertTrue(a > b) does not.
        default_strand_hits = tree.downstream_of_interval(
            Interval(59, 60), num_intervals=200)
        for hit in default_strand_hits:
            self.assertGreater(hit.start, 60)

        minus_strand_hits = tree.downstream_of_interval(
            Interval(59, 60, strand=-1), num_intervals=200)
        for hit in minus_strand_hits:
            self.assertLess(hit.start, 59)
예제 #6
0
    def test_upstream(self):
        """Check ``upstream_of_interval`` with and without ``strand=-1``.

        For a query built without a strand argument every hit must end
        before the query start; for ``strand=-1`` queries every hit must
        start beyond the asserted position.
        """
        tree = self.intervals

        # assertGreater/assertLess report both operands on failure, which
        # a bare assertTrue(a > b) does not.
        default_strand_hits = tree.upstream_of_interval(
            Interval(59, 60), num_intervals=200)
        for hit in default_strand_hits:
            self.assertLess(hit.end, 59)

        minus_strand_hits = tree.upstream_of_interval(
            Interval(60, 70, strand=-1), num_intervals=200)
        for hit in minus_strand_hits:
            self.assertGreater(hit.start, 70)

        minus_strand_point_hits = tree.upstream_of_interval(
            Interval(58, 58, strand=-1), num_intervals=200)
        for hit in minus_strand_point_hits:
            self.assertGreater(hit.start, 59)
예제 #7
0
    def test_right(self):
        """Check ``right`` queries against the fixture stored in ``self.intervals``."""
        tree = self.intervals

        # NOTE(review): this first check exercises ``left``, not ``right``;
        # it looks copied from the ``left`` test -- confirm whether a
        # ``right`` assertion was intended here.
        found = str(tree.left(60, n=2))
        wanted = str([Interval(50, 59), Interval(40, 49)])
        self.assertEqual(found, wanted)

        def first_start_right_of(decade):
            """Return the start of the single hit to the right of decade + 1."""
            hits = tree.right(decade + 1, n=1)
            assert len(hits) == 1
            return hits[0].start

        # One position past a multiple of ten, the next hit must start at
        # the following multiple of ten.
        for decade in range(10, 100, 10):
            self.assertEqual(first_start_right_of(decade), decade + 10)

        # One position before a multiple of ten, the hit must start
        # exactly at that multiple.
        for decade in range(0, 100, 10):
            hits = tree.right(decade - 1, max_dist=10, n=1)
            self.assertEqual(hits[0].start, decade)
예제 #8
0
    def setUp(self):
        """Fill an ``IntervalTree`` through each of its four insertion calls.

        For every start in ``range(1, 1000, 80)`` four intervals are
        stored, each carrying a small dict payload derived from
        ``start * start``. The tree is exposed as both ``self.intervals``
        and ``self.iv``; the number of stored intervals goes to
        ``self.nintervals``.
        """
        tree = IntervalTree()
        inserted = 0
        for start in range(1, 1000, 80):
            squared = start * start

            tree.insert(start, start + 10, {"value": squared})
            # add is synonym for insert.
            tree.add(start + 20, start + 30, {"astr": str(squared)})

            # Alternatively, hand over an Interval object that carries its
            # own start/end attributes.
            tree.insert_interval(
                Interval(start + 40, start + 50,
                         value={"astr": str(squared)}))
            tree.add_interval(
                Interval(start + 60, start + 70,
                         value={"astr": str(squared)}))

            inserted += 4

        self.intervals = tree
        self.iv = tree
        self.nintervals = inserted
예제 #9
0
    def test_n(self):
        """Check that ``after`` and ``after_interval`` return the next two hits.

        With ``num_intervals=2`` and ``max_dist=20``, a query at a multiple
        of ten must yield hits starting 10 and 20 positions later, in that
        order.
        """
        tree = self.intervals

        def check_next_two(hits, origin):
            """Assert the first two hits start at origin + 10 and origin + 20."""
            self.assertEqual(hits[0].start, origin + 10)
            self.assertEqual(hits[1].start, origin + 20)

        for origin in range(0, 90, 10):
            # Query by position first, then by a single-point interval.
            check_next_two(
                tree.after(origin, max_dist=20, num_intervals=2), origin)
            check_next_two(
                tree.after_interval(Interval(origin, origin),
                                    max_dist=20, num_intervals=2),
                origin)
예제 #10
0
def _check_cdna_vs_utr(transcript):
    """
    Verify that cDNA + UTR in the transcript add up.

    If the cDNA is longer than UTR + CDS combined, the UTR is rebuilt from
    scratch: every CDS segment is assigned to the exon that contains it and
    the exon portions not covered by CDS become the new UTR segments. At
    the end the lengths are checked for consistency.

    :param transcript: object exposing ``id``, ``logger``, ``exons``,
        ``cdna_length``, ``combined_cds``, ``combined_utr``,
        ``combined_cds_length`` and ``combined_utr_length``.
        ``combined_utr`` and ``combined_cds`` are reassigned in place when
        the UTR has to be recalculated.
    :return: None
    :raises InvalidCDS: if a CDS segment maps to zero or several exons,
        extends past its exon, leaves no gap before the next CDS segment
        on the same exon, or if the lengths still do not add up at the end.
    :raises AssertionError: if one of the internal sanity checks fails.
    """

    transcript.logger.debug("Checking the cDNA for %s", transcript.id)
    if transcript.cdna_length > transcript.combined_utr_length + transcript.combined_cds_length:
        if transcript.combined_utr == transcript.combined_cds == []:
            # non-coding transcript
            transcript.logger.debug("%s is non coding, returning",
                                    transcript.id)
            return
        assert transcript.combined_cds != []

        transcript.logger.debug(
            "Recalculating the UTR for %s. Reason: cDNA length %s, UTR %s, CDS %s (total %s)",
            transcript.id, transcript.cdna_length,
            transcript.combined_utr_length, transcript.combined_cds_length,
            transcript.combined_utr_length + transcript.combined_cds_length)
        transcript.combined_utr = []  # Reset
        transcript.combined_cds = sorted(transcript.combined_cds,
                                         key=operator.itemgetter(0, 1))

        # Sanity check only: the CDS segments must be loadable into a tree.
        combined_cds = IntervalTree.from_tuples(transcript.combined_cds)
        assert isinstance(combined_cds, IntervalTree)

        exons = IntervalTree.from_intervals(
            [Interval(*exon) for exon in transcript.exons])

        # Exon -> list of the CDS segments found on it.
        mapper = defaultdict(list)
        for cds in transcript.combined_cds:
            # NOTE(review): the query start is shifted down by one,
            # presumably to match the tree's coordinate convention -- confirm.
            fexon = exons.find(cds[0] - 1, cds[1], strict=False)
            if len(fexon) > 1:
                raise InvalidCDS(
                    "{} has a CDS ({}) which straddles {} different exons ({})."
                    .format(transcript.id, cds, len(fexon), fexon))
            elif len(fexon) == 0:
                raise InvalidCDS(
                    "{} has a CDS ({}) which is not mapped to any exon.".
                    format(transcript.id, cds))
            mapper[fexon[0]].append(cds)

        for exon in transcript.exons:
            if exon not in mapper:
                # No CDS on this exon: it is UTR in its entirety.
                transcript.combined_utr.append(exon)
                continue
            elif len(mapper[exon]) == 1:
                cds = mapper[exon][0]
                if cds[0] == exon[0] and exon[1] == cds[1]:
                    # The exon is fully coding, no UTR to add.
                    continue
                else:
                    before = None
                    after = None
                    if cds[0] < exon[0] or cds[1] > exon[1]:
                        raise InvalidCDS("{} in {} debords its exon {}".format(
                            cds, transcript.id, exon))
                    if cds[0] > exon[0]:
                        # Exon portion preceding the CDS.
                        before = (exon[0], max(cds[0] - 1, exon[0]))
                        transcript.combined_utr.append(before)
                    if cds[1] < exon[1]:
                        # Exon portion following the CDS.
                        after = (min(cds[1] + 1, exon[1]), exon[1])
                        transcript.combined_utr.append(after)
                    assert before or after, (exon, cds)
            else:
                # Several CDS segments on the same exon: the UTR is made of
                # the leading part, the gaps between consecutive segments
                # and the trailing part.
                transcript.logger.debug("Starting to find the UTRs for %s",
                                        exon)
                found = sorted(mapper[exon])
                utrs = []
                for pos, interval in enumerate(found):
                    if pos == len(found) - 1:
                        # Last segment: only a trailing UTR can follow.
                        if exon[1] > interval[1]:
                            utrs.append((min(exon[1],
                                             interval[1] + 1), exon[1]))
                        continue
                    if pos == 0 and exon[0] < interval[0]:
                        utrs.append((exon[0], max(exon[0], interval[0] - 1)))
                    next_interval = found[pos + 1]
                    # At least one base must separate two consecutive
                    # segments; directly adjacent segments are rejected as
                    # well as overlapping ones.
                    if not (interval[1] + 1 <= next_interval[0] - 1):
                        raise InvalidCDS(
                            "Error while inferring the UTR for a transcript with multiple ORFs: overlapping CDS found."
                        )
                    utrs.append((interval[1] + 1, next_interval[0] - 1))
                assert utrs, found
                utr_sum = sum(_[1] - _[0] + 1 for _ in utrs)
                cds_sum = sum(_[1] - _[0] + 1 for _ in found)
                # UTR and CDS segments together must tile the exon exactly.
                assert utr_sum + cds_sum == exon[1] - exon[0] + 1, (utr_sum,
                                                                    cds_sum,
                                                                    exon[1] -
                                                                    exon[0] +
                                                                    1, utrs,
                                                                    found)
                transcript.combined_utr.extend(utrs)

    # If no CDS and no UTR are present, all good
    equality_one = (transcript.combined_cds_length ==
                    transcript.combined_utr_length == 0)
    # Otherwise, if cDNA length == UTR + CDS, all good
    equality_two = (
        transcript.cdna_length == transcript.combined_utr_length +
        transcript.combined_cds_length)
    if not (equality_one or equality_two):
        # Something fishy going on
        raise InvalidCDS("""Failed to create the UTR:
ID: {}
Exons: {}
Combined CDS: {}
Combined UTR: {}
CDS == UTR == 0: {}
CDNA == CDS + UTR: {}
CDNA == {}
CDS == {}
UTR == {}""".format(transcript.id, transcript.exons, transcript.combined_cds,
                    transcript.combined_utr, equality_one, equality_two,
                    transcript.cdna_length, transcript.combined_cds_length,
                    transcript.combined_utr_length))