Exemple #1
0
    def _get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords):
        '''Finds variants where ref has the variant and so does the contig. Which means
           that there was no mummer call to flag it up so need to look through the known
           ref variants. Also need to check that the variant is in a nucmer match to an
           assembly contig.'''
        variants = []

        for ref_variant_pos, ref_variants_set in sorted(known_ref_variants.items()):
            for known_ref_variant in ref_variants_set:
                if known_ref_variant not in used_ref_variants:
                    variant_pos_matches_contig = False
                    pos = known_ref_variant.variant.position

                    if known_ref_variant.seq_type == 'n':
                        ref_interval = intervals.Interval(pos, pos)
                    elif known_ref_variant.seq_type == 'p':
                        ref_interval = intervals.Interval(3 * pos, 3 * pos + 2)
                    else:
                        raise Error('Unexpected variant type "' + known_ref_variant.variant_type + '" in _get_remaining_known_ref_variants. Cannot continue')

                    for interval in nucmer_coords:
                        if ref_interval.intersects(interval):
                            variant_pos_matches_contig = True
                            break

                    if variant_pos_matches_contig:
                        variants.append((None, known_ref_variant.seq_type, None, None, None, {known_ref_variant}, set()))

        return variants
 def test_intersection(self):
     '''intersection() gives None for disjoint intervals, otherwise the shared region'''
     left = intervals.Interval(5, 10)
     overlapping = intervals.Interval(8, 15)
     disjoint = intervals.Interval(12, 20)
     expectations = (
         (disjoint, None),
         (overlapping, intervals.Interval(8, 10)),
     )
     for other, expected in expectations:
         self.assertEqual(left.intersection(other), expected)
    def test_length_sum_from_list(self):
        '''length_sum_from_list() adds up the lengths of all intervals in a list'''
        bounds = [(1, 2), (4, 5), (10, 19)]
        interval_list = [intervals.Interval(start, end) for start, end in bounds]

        self.assertEqual(14, intervals.length_sum_from_list(interval_list))
Exemple #4
0
    def __init__(self, contig1, start1, end1, contig2, start2, end2):
        '''Stores a hit between two contigs.

        For each contig this records its name, whether the hit is reversed
        on it (start > end) and the hit coordinates as an Interval running
        from the smaller to the larger coordinate. If the first name
        compares greater than the second, reverse() is called.'''
        hits = ((contig1, start1, end1), (contig2, start2, end2))

        self.names = [name for name, _, _ in hits]
        self.rev = {name: start > end for name, start, end in hits}
        self.coords = {
            name: intervals.Interval(min(start, end), max(start, end))
            for name, start, end in hits
        }

        first_name, second_name = self.names
        if first_name > second_name:
            self.reverse()
    def test_set_coords(self):
        '''_set_coords() rebuilds coords after they have been cleared'''
        def recomputed_coords():
            # wipe the coords so the value can only come from _set_coords()
            self.trans.coords = None
            self.trans._set_coords()
            return self.trans.coords

        self.assertEqual(recomputed_coords(), intervals.Interval(42, 100))

        self.trans.add_gff_record(self.gff_exon)
        self.assertEqual(recomputed_coords(), intervals.Interval(44, 53))
 def test_init(self):
     '''Interval() must raise Error for non-int coordinates or when end < start'''
     bad_arguments = [('a', 1), (1, 'a'), ('a', 'a'), (3, 2)]
     for start, end in bad_arguments:
         with self.assertRaises(intervals.Error):
             intervals.Interval(start, end)
 def test_can_extend_start(self):
     '''can_extend_start() respects min_extend and the other transcript's coords'''
     self.trans.add_gff_record(self.gff_exon)
     other = copy.deepcopy(self.trans)

     other.coords = intervals.Interval(42,44)
     self.assertTrue(self.trans.can_extend_start(other))
     for min_extend, should_extend in [(2, True), (3, False), (4, False)]:
         check = self.assertTrue if should_extend else self.assertFalse
         check(self.trans.can_extend_start(other, min_extend=min_extend))

     other.coords = intervals.Interval(44,50)
     self.assertFalse(self.trans.can_extend_start(other))
 def test_union_flll_gap(self):
     '''union_fill_gap() ignores gaps between intervals and returns the full range, in either call order'''
     a = intervals.Interval(5, 10)
     b = intervals.Interval(8, 15)
     c = intervals.Interval(12, 20)
     d = intervals.Interval(21, 22)
     cases = [
         (a, c, (5, 20)),
         (a, b, (5, 15)),
         (c, d, (12, 22)),
     ]
     for first, second, (start, end) in cases:
         self.assertEqual(first.union_fill_gap(second), intervals.Interval(start, end))
         self.assertEqual(second.union_fill_gap(first), intervals.Interval(start, end))
Exemple #9
0
def _orfs_from_aa_seq(seq):
    orfs = []
    pos = 0
    while pos < len(seq):
        next_stop = seq.find('*', pos)
        if next_stop == -1:
            orfs.append(intervals.Interval(pos, len(seq)-1))
            break
        elif next_stop > pos:
            orfs.append(intervals.Interval(pos, next_stop))
        pos = next_stop + 1
    return orfs
Exemple #10
0
    def test_change_hit_coords_with_intersection(self):
        '''_change_hit_coords_with_intersection() trims both hits, for every strand combination'''
        # (arguments for the starting edge, contig to restrict, arguments for the expected edge)
        cases = [
            (('c1', 1, 42, 'c2', 10, 50), 'c1', ('c1', 20, 30, 'c2', 29, 38)),
            (('c1', 1, 42, 'c2', 10, 50), 'c2', ('c1', 11, 22, 'c2', 20, 30)),
            (('c1', 1, 42, 'c2', 50, 10), 'c1', ('c1', 20, 30, 'c2', 31, 22)),
            (('c1', 1, 42, 'c2', 50, 10), 'c2', ('c1', 21, 32, 'c2', 30, 20)),
            (('c1', 42, 1, 'c2', 10, 50), 'c1', ('c1', 30, 20, 'c2', 22, 31)),
            (('c1', 42, 1, 'c2', 10, 50), 'c2', ('c1', 32, 21, 'c2', 20, 30)),
            (('c1', 42, 1, 'c2', 50, 10), 'c1', ('c1', 30, 20, 'c2', 38, 29)),
            (('c1', 42, 1, 'c2', 50, 10), 'c2', ('c1', 22, 11, 'c2', 30, 20)),
        ]

        for start_args, contig, expected_args in cases:
            e = edge.Edge(*start_args)
            e._change_hit_coords_with_intersection(contig, intervals.Interval(20, 30))
            self.assertEqual(e, edge.Edge(*expected_args))
Exemple #11
0
    def orfs(self, frame=0, revcomp=False):
        '''Returns a list of ORFs that the sequence has, starting on the given
           frame. Each returned ORF is an intervals.Interval object, in
           nucleotide coordinates of this sequence.
           If revcomp=True, then finds the ORFs of the reverse complement
           of the sequence; the returned coordinates are still relative to
           the sequence as it is stored.
           frame must be 0, 1 or 2.'''
        assert frame in [0,1,2]
        if revcomp:
            self.revcomp()

        try:
            aa_seq = self.translate(frame=frame).seq.rstrip('X')
        finally:
            # revcomp() is called for its effect on self, so always undo it,
            # even if the translation fails
            if revcomp:
                self.revcomp()

        seq_length = len(self)
        nucleotide_orfs = []
        for aa_orf in _orfs_from_aa_seq(aa_seq):
            # convert amino acid coordinates into nucleotide coordinates
            if revcomp:
                start = seq_length - (aa_orf.end * 3 + 3) - frame
                end = seq_length - (aa_orf.start * 3) - 1 - frame
            else:
                start = aa_orf.start * 3 + frame
                end = aa_orf.end * 3 + 2 + frame

            nucleotide_orfs.append(intervals.Interval(start, end))

        return nucleotide_orfs
Exemple #12
0
 def gaps(self, min_length = 1):
     '''Finds the positions of all gaps in the sequence that are at least min_length long.

     A gap is a run of one or more N characters, upper or lower case.
     Returns a list of Intervals, in the order the gaps occur. Coords are
     zero-based, with both ends included.'''
     gaps = []
     regex = re.compile('N+', re.IGNORECASE)
     for m in regex.finditer(self.seq):
         # the end of a match span is exclusive, so end - start is the gap length
         start, end = m.span()
         if end - start >= min_length:
             gaps.append(intervals.Interval(start, end - 1))
     return gaps
 def might_extend(self, other, min_extend=1):
     '''Returns True if other is a candidate for extending this object.

     All of the following must hold: both are on the same seqname; the
     strands are compatible (equal and not '.' or 'Inconsistent', or self
     is '+'/'-' while other is '.' and both have exactly one exon); both
     have at least one exon; other intersects or directly abuts the coords
     of self after widening them by min_extend - 1 at each end; and other
     sticks out beyond those widened coords on at least one side.'''
     coords = intervals.Interval(self.coords.start - min_extend + 1, self.coords.end + min_extend - 1)

     strands_ok = (self.strand == other.strand and self.strand not in ['.', 'Inconsistent']) \
                  or (self.strand in ['-', '+'] and other.strand == '.' and len(self.exons) == len(other.exons) == 1)

     if not (self.seqname == other.seqname and strands_ok):
         return False

     if len(self.exons) * len(other.exons) == 0:
         return False

     touching = coords.intersects(other.coords) \
                or other.coords.end + 1 == coords.start \
                or coords.end + 1 == other.coords.start
     if not touching:
         return False

     return other.coords.start < coords.start or coords.end < other.coords.end
Exemple #14
0
    def contig_coords(self):
        '''Finds coords of contigs, i.e. everything that's not a gap (N or n).

        Returns a list of Intervals. Coords are zero-based, with both ends
        included. The list is empty if the sequence is empty or is nothing
        but gaps.'''
        # contigs are the opposite of gaps, so walk along the gaps and emit
        # whatever lies between them.
        # assumes self.gaps() returns the gaps in increasing order without
        # overlaps, as a left-to-right scan of the sequence would
        contigs = []
        next_contig_start = 0

        for g in self.gaps():
            if g.start > next_contig_start:
                contigs.append(intervals.Interval(next_contig_start, g.start - 1))
            next_contig_start = g.end + 1

        # whatever is left after the last gap (or the whole sequence if
        # there were no gaps); nothing to add for an empty sequence
        if next_contig_start < len(self):
            contigs.append(intervals.Interval(next_contig_start, len(self) - 1))

        return contigs
Exemple #15
0
 def test_union(self):
     '''union() gives None for separated intervals, otherwise the merged interval, in either call order'''
     a = intervals.Interval(5, 10)
     b = intervals.Interval(8, 15)
     c = intervals.Interval(12, 20)
     d = intervals.Interval(21, 22)

     # a and c are separated, so no union exists
     self.assertEqual(a.union(c), None)
     self.assertEqual(c.union(a), None)

     mergeable = [
         (a, b, (5, 15)),
         (c, d, (12, 22)),
     ]
     for first, second, (start, end) in mergeable:
         self.assertEqual(first.union(second), intervals.Interval(start, end))
         self.assertEqual(second.union(first), intervals.Interval(start, end))
Exemple #16
0
 def test_distance_to_point(self):
     '''distance_to_point() is 0 for points in the interval and grows with distance outside it'''
     # (expected distance, point), all measured against Interval(42, 50)
     expected_distances = [
         (0, 42),
         (0, 44),
         (0, 50),
         (1, 41),
         (1, 51),
         (5, 55),
         (5, 37),
     ]
     for expected, point in expected_distances:
         self.assertEqual(expected, intervals.Interval(42, 50).distance_to_point(point))
Exemple #17
0
    def test_merged_coords_from_simple_nonredundant_path(self):
        '''merged_coords_from_simple_nonredundant_path() on a two node and a three node path'''
        # two contigs joined by one edge
        self.g.add_edge(edge.Edge('c1', 199, 0, 'c2', 1319, 1119))
        got = self.g.merged_coords_from_simple_nonredundant_path(['c1', 'c2'])
        wanted = [
            ['c1', intervals.Interval(200, 659), True],
            ['c2', intervals.Interval(0, 1319), True],
        ]
        self.assertListEqual(wanted, got)

        # fresh graph with three contigs chained by two edges
        self.g = graph.Graph(self.asm)
        for new_edge_args in [('c1', 610, 652, 'c2', 1, 42),
                              ('c2', 1250, 1310, 'c3', 5, 65)]:
            self.g.add_edge(edge.Edge(*new_edge_args))
        got = self.g.merged_coords_from_simple_nonredundant_path(['c1', 'c2', 'c3'])
        wanted = [
            ['c1', intervals.Interval(0, 609), False],
            ['c2', intervals.Interval(1, 1249), False],
            ['c3', intervals.Interval(5, 2159), False],
        ]
        self.assertListEqual(wanted, got)
Exemple #18
0
    def is_complete_orf(self):
        '''Returns True iff the length is at least 6 and a multiple of 3, and
           one of the ORFs reported by orfs() spans the whole sequence'''
        seq_length = len(self)
        if seq_length < 6 or seq_length % 3 != 0:
            return False

        candidate_orfs = self.orfs()
        whole_sequence = intervals.Interval(0, len(self) - 1)
        return any(orf == whole_sequence for orf in candidate_orfs)
Exemple #19
0
    def test_merge_overlapping_in_list(self):
        '''merge_overlapping_in_list() merges the list in place'''
        unmerged_bounds = [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30), (29, 50), (65, 70)]
        merged_bounds = [(1, 2), (10, 60), (65, 70)]

        to_merge = [intervals.Interval(start, end) for start, end in unmerged_bounds]
        expected = [intervals.Interval(start, end) for start, end in merged_bounds]

        intervals.merge_overlapping_in_list(to_merge)
        self.assertSequenceEqual(to_merge, expected)
Exemple #20
0
    def _node_to_coords(self, nodes, i):
        '''Works out which part of node nodes[i] to keep for a path through nodes.

        nodes: list of node names; consecutive nodes must be joined in
            self.graph by exactly one edge each.
        i: index into nodes of the node to process.

        Returns a list [node name, Interval of coords to keep, e.rev[node]],
        where e is the edge to the next node (or, for the last node, the
        edge from the previous node).

        NOTE(review): the edge objects taken from the graph are modified
        by the make_contig_first() and reverse() calls below.'''
        assert 0 <= i < len(nodes)
        node = nodes[i]
        if i == len(nodes) - 1:
            # last node: only the edge from the previous node is available
            edges = self.graph[nodes[i - 1]][node]['edges']
        else:
            edges = self.graph[node][nodes[i + 1]]['edges']
            if i > 0:
                # interior node: the edge from the previous node is needed as well
                previous_edges = self.graph[nodes[i - 1]][node]['edges']
                assert len(previous_edges) == 1
                previous_e = previous_edges[0]
                previous_e.make_contig_first(nodes[i - 1])
                previous_open_end = previous_e.open_end(node)

        assert len(edges) == 1
        e = edges[0]
        e.make_contig_first(node)
        open_end = e.open_end(node)

        if 0 < i < len(nodes) - 1:
            # interior node: the two edges must report different open ends.
            # The interval is built from the start coordinates of both hits.
            assert open_end != previous_open_end
            coords = intervals.Interval(
                min(e.coords[node].start, previous_e.coords[node].start),
                max(e.coords[node].start - 1,
                    previous_e.coords[node].start - 1))
        elif i == 0:
            # first node: keep everything on one side of the hit, excluding the hit
            if open_end == edge.LEFT:
                coords = intervals.Interval(0, e.coords[node].start - 1)
            else:
                coords = intervals.Interval(e.coords[node].end + 1,
                                            self.contig_lengths[node] - 1)
        else:
            # last node: keep from one end of the contig through the hit itself
            e.reverse()  # now node is second in the edge, not first
            open_end = e.open_end(node)
            if open_end == edge.LEFT:
                coords = intervals.Interval(0, e.coords[node].end)
            else:
                coords = intervals.Interval(e.coords[node].start,
                                            self.contig_lengths[node] - 1)

        return [node, coords, e.rev[node]]
Exemple #21
0
    def _set_coords(self):
        if len(self.transcripts) > 0:
            start = min([t.coords.start for t in self.transcripts.values()])
            end = max([t.coords.end for t in self.transcripts.values()])
            self.coords = intervals.Interval(start, end)
        elif self.gene_record is not None:
            self.coords = self.gene_record.coords
        else:
            raise Error('Error setting coordinates for gene ' + self.gene_id +
                        ' - cannot continue')

        if self.gene_record is not None:
            self.gene_record.coords = self.coords
    def test_init(self):
        '''__init__ sets coords, strand, seqname and mRNA, and leaves every feature list empty'''
        self.assertEqual(self.trans.coords, intervals.Interval(42, 100))
        self.assertEqual(self.trans.strand, '+')
        self.assertEqual(self.trans.seqname, 'seqname')

        feature_lists = [
            self.trans.five_utr,
            self.trans.three_utr,
            self.trans.exons,
            self.trans.ncRNA,
            self.trans.rRNA,
            self.trans.tRNA,
            self.trans.snRNA,
        ]
        for features in feature_lists:
            self.assertEqual(len(features), 0)

        self.assertEqual(self.trans.mRNA, self.gff_mRNA)
    def _set_coords(self):
        try:
            start = min([t.coords.start for t in self.five_utr + self.three_utr + self.exons + self.ncRNA + self.rRNA + self.tRNA + self.snRNA])
            end = max([t.coords.end for t in self.five_utr + self.three_utr + self.exons + self.ncRNA + self.rRNA + self.tRNA + self.snRNA])
        except:
            if self.mRNA is not None:
                start = self.mRNA.coords.start
                end = self.mRNA.coords.end
            else:
                return

        self.coords = intervals.Interval(start, end)
        if self.mRNA is not None:
            self.mRNA.coords = self.coords
Exemple #24
0
    def test_intersects(self):
        '''intersects() is True exactly when two intervals share at least one position'''
        a = intervals.Interval(5, 10)
        disjoint = [intervals.Interval(start, end) for start, end in [(3, 4), (11, 20)]]
        overlapping = [
            intervals.Interval(start, end)
            for start, end in [(3, 5), (3, 6), (9, 12), (10, 12), (6, 7), (1, 20)]
        ]

        for other in disjoint:
            self.assertFalse(a.intersects(other),
                             "shouldn't intersect: " + str(a) + ', ' + str(other))

        for other in overlapping:
            self.assertTrue(a.intersects(other),
                            'should intersect: ' + str(a) + ', ' + str(other))
Exemple #25
0
    def test_orfs_from_aa_seq(self):
        '''_orfs_from_aa_seq() finds the expected ORFs in a set of amino acid strings'''
        def as_intervals(bounds):
            return [intervals.Interval(start, end) for start, end in bounds]

        test_seqs = [
            '', '*', '**', 'A', 'A*A*A', 'AB**CDE*AB', '*ABCDE*', '**ABCDE**'
        ]

        # expected ORF bounds, one entry per sequence in test_seqs
        correct_coords = [
            as_intervals(bounds) for bounds in [
                [],
                [],
                [],
                [(0, 0)],
                [(0, 1), (2, 3), (4, 4)],
                [(0, 2), (4, 7), (8, 9)],
                [(1, 6)],
                [(2, 7)],
            ]
        ]

        for aa_seq, expected in zip(test_seqs, correct_coords):
            found = sequences._orfs_from_aa_seq(aa_seq)
            self.assertListEqual(expected, found)
Exemple #26
0
    def test_gaps(self):
        '''gaps() should find the gaps in a sequence correctly'''
        bases = ['ACGT', 'NACGT', 'NACGTN', 'ANNCGT', 'NANNCGTNN']
        gap_bounds = [
            [],
            [(0, 0)],
            [(0, 0), (5, 5)],
            [(1, 2)],
            [(0, 0), (2, 3), (7, 8)],
        ]

        test_seqs = [sequences.Fasta('ID', seq) for seq in bases]
        correct_gaps = [
            [intervals.Interval(start, end) for start, end in bounds]
            for bounds in gap_bounds
        ]

        for fasta, expected in zip(test_seqs, correct_gaps):
            found = fasta.gaps()
            self.assertListEqual(expected, found)
Exemple #27
0
    def orfs(self, frame=0, revcomp=False):
        '''Returns a list of ORFs that the sequence has, starting on the given
           frame. Each returned ORF is an intervals.Interval object, in
           nucleotide coordinates of this sequence.
           If revcomp=True, then finds the ORFs of the reverse complement
           of the sequence; the returned coordinates are still relative to
           the sequence as it is stored.
           frame must be 0, 1 or 2.'''
        assert frame in [0, 1, 2]
        if revcomp:
            self.revcomp()

        try:
            aa_seq = self.translate(frame=frame).seq.rstrip('X')
        finally:
            # revcomp() is called for its effect on self, so always undo it,
            # even if the translation fails
            if revcomp:
                self.revcomp()

        seq_length = len(self)
        nucleotide_orfs = []
        for aa_orf in _orfs_from_aa_seq(aa_seq):
            # convert amino acid coordinates into nucleotide coordinates
            if revcomp:
                start = seq_length - (aa_orf.end * 3 + 3) - frame
                end = seq_length - (aa_orf.start * 3) - 1 - frame
            else:
                start = aa_orf.start * 3 + frame
                end = aa_orf.end * 3 + 2 + frame

            nucleotide_orfs.append(intervals.Interval(start, end))

        return nucleotide_orfs
    def test_intersects(self):
        '''intersects() compares the coords of two transcripts'''
        other = copy.deepcopy(self.trans)
        disjoint_bounds = [(1,41), (101,141)]
        overlapping_bounds = [(1,42), (42,50), (50,60), (50,100), (100,142), (20,424242)]

        disjoint = [intervals.Interval(start, end) for start, end in disjoint_bounds]
        overlapping = [intervals.Interval(start, end) for start, end in overlapping_bounds]

        for coords in disjoint:
            other.coords = coords
            self.assertFalse(self.trans.intersects(other))

        for coords in overlapping:
            other.coords = coords
            self.assertTrue(self.trans.intersects(other))
Exemple #29
0
    def test_contains(self):
        '''contains() is True only for intervals lying entirely inside the interval'''
        a = intervals.Interval(5, 10)
        outside_bounds = [
            (1, 2), (4, 5), (4, 10), (4, 11), (5, 11),
            (1, 2), (9, 11), (10, 11), (11, 20),
        ]
        inside_bounds = [(5, 5), (5, 10), (6, 7), (6, 10), (10, 10)]

        not_contained = [intervals.Interval(start, end) for start, end in outside_bounds]
        contained = [intervals.Interval(start, end) for start, end in inside_bounds]

        for other in not_contained:
            self.assertFalse(a.contains(other),
                             "shouldn't contain: " + str(a) + ', ' + str(other))

        for other in contained:
            self.assertTrue(a.contains(other),
                            'should contain: ' + str(a) + ', ' + str(other))
Exemple #30
0
 def test_len(self):
     '''len() of an Interval counts both of its end positions'''
     for (start, end), expected in [((1, 2), 2), ((1, 1), 1), ((10, 20), 11)]:
         self.assertEqual(len(intervals.Interval(start, end)), expected)