def test_intersection(self):
    '''intersection() gives None for disjoint intervals, otherwise the shared range'''
    base = intervals.Interval(5, 10)
    overlapping = intervals.Interval(8, 15)
    disjoint = intervals.Interval(12, 20)

    # 5-10 and 12-20 share no position
    nothing_shared = base.intersection(disjoint)
    self.assertEqual(nothing_shared, None)

    # 5-10 and 8-15 share positions 8 to 10
    shared = base.intersection(overlapping)
    self.assertEqual(shared, intervals.Interval(8, 10))
def test_length_sum_from_list(self):
    '''length_sum_from_list() should add up the lengths of all intervals in a list'''
    coords = [(1, 2), (4, 5), (10, 19)]
    interval_list = [intervals.Interval(start, end) for start, end in coords]
    # expected total is 2 + 2 + 10, i.e. both ends of each interval are counted
    total = intervals.length_sum_from_list(interval_list)
    self.assertEqual(14, total)
def test_init(self):
    '''Interval() should raise Error for a non-int coordinate, or when end < start'''
    bad_coords = [
        ('a', 1),    # start is not an int
        (1, 'a'),    # end is not an int
        ('a', 'a'),  # neither is an int
        (3, 2),      # end before start
    ]
    for start, end in bad_coords:
        with self.assertRaises(intervals.Error):
            intervals.Interval(start, end)
def test_union_flll_gap(self):
    '''union_fill_gap() should span from the lowest start to the highest end, whether or not the intervals overlap'''
    first = intervals.Interval(5, 10)
    second = intervals.Interval(8, 15)
    third = intervals.Interval(12, 20)
    fourth = intervals.Interval(21, 22)

    cases = [
        (first, third, (5, 20)),    # separated by a gap
        (first, second, (5, 15)),   # overlapping
        (third, fourth, (12, 22)),  # next to each other
    ]
    for left, right, (start, end) in cases:
        # the answer must be the same whichever interval the method is called on
        self.assertEqual(left.union_fill_gap(right), intervals.Interval(start, end))
        self.assertEqual(right.union_fill_gap(left), intervals.Interval(start, end))
def _orfs_from_aa_seq(seq):
    '''Splits the string seq at each '*' and returns the pieces as a list of Intervals.

    Coords are zero-based positions in seq. A piece that is closed by a '*'
    ends at the position of that '*'. A final piece with no '*' after it
    ends at the last position of seq. A '*' with nothing in front of it (at
    the start of seq, or straight after another '*') gives no Interval.'''
    found = []
    seq_length = len(seq)
    start = 0
    while start < seq_length:
        stop = seq.find('*', start)
        if stop == -1:
            # no '*' left: everything from start to the end is the last piece
            found.append(intervals.Interval(start, seq_length - 1))
            return found
        if stop > start:
            found.append(intervals.Interval(start, stop))
        # carry on from the position after the '*'
        start = stop + 1
    return found
def gaps(self, min_length=1):
    '''Finds the positions of all gaps in the sequence that are at least min_length long.

    A gap is a run of N or n in self.seq. Returns a list of Intervals, in
    the order the gaps appear. Coords are zero-based and the end of each
    Interval is the position of the last N of the gap.'''
    found = []
    regex = re.compile('N+', re.IGNORECASE)
    for m in regex.finditer(self.seq):
        # m.end() is one past the last N, so the run length is end - start
        # (adding 1 here would let through gaps one base shorter than min_length)
        if m.end() - m.start() >= min_length:
            found.append(intervals.Interval(m.start(), m.end() - 1))
    return found
def test_union(self):
    '''union() should give None when the intervals cannot be joined, otherwise the combined interval'''
    first = intervals.Interval(5, 10)
    second = intervals.Interval(8, 15)
    third = intervals.Interval(12, 20)
    fourth = intervals.Interval(21, 22)

    cases = [
        (first, third, None),                         # gap between 10 and 12
        (first, second, intervals.Interval(5, 15)),   # overlapping
        (third, fourth, intervals.Interval(12, 22)),  # 20 and 21 are next to each other
    ]
    for left, right, expected in cases:
        # the answer must be the same whichever interval the method is called on
        self.assertEqual(left.union(right), expected)
        self.assertEqual(right.union(left), expected)
def test_merge_overlapping_in_list(self):
    '''merge_overlapping_in_list() should merge the intervals of an unsorted list correctly'''
    unmerged_coords = [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30), (29, 50), (65, 70)]
    merged_coords = [(1, 2), (10, 60), (65, 70)]
    to_merge = [intervals.Interval(start, end) for start, end in unmerged_coords]
    expected = [intervals.Interval(start, end) for start, end in merged_coords]

    # the function changes the list it is given, so check that same list afterwards
    intervals.merge_overlapping_in_list(to_merge)
    self.assertSequenceEqual(to_merge, expected)
def contig_coords(self):
    '''Finds coords of contigs, i.e. everything that's not a gap (N or n).

    Returns a list of Intervals. Coords are zero-based. A sequence of
    length zero, or one that is nothing but gap, has no contigs and gives
    an empty list.'''
    # contigs are the opposite of gaps, so work out the coords from the gap coords
    gaps = self.gaps()
    seq_length = len(self)
    if seq_length == 0:
        # nothing to report; building Interval(0, -1) would have end < start
        return []
    if not gaps:
        return [intervals.Interval(0, seq_length - 1)]

    # boundaries alternates: contig start, contig end, contig start, ...
    boundaries = [0]
    for gap in gaps:
        if gap.start == 0:
            # sequence begins with a gap, so the first contig starts after it
            boundaries = [gap.end + 1]
        else:
            boundaries += [gap.start - 1, gap.end + 1]

    if boundaries[-1] < seq_length:
        # the last gap stops before the end of the sequence: close the final contig
        boundaries.append(seq_length - 1)

    # pair up (start, end); a start left without an end (sequence finishes
    # with a gap) is not used
    return [
        intervals.Interval(boundaries[i], boundaries[i + 1])
        for i in range(0, len(boundaries) - 1, 2)
    ]
def test_intersects(self):
    '''intersects() should be True exactly when two intervals share at least one position'''
    base = intervals.Interval(5, 10)
    apart = [(3, 4), (11, 20)]
    touching = [(3, 5), (3, 6), (9, 12), (10, 12), (6, 7), (1, 20)]

    for start, end in apart:
        other = intervals.Interval(start, end)
        self.assertFalse(base.intersects(other),
                         "shouldn't intersect: %s, %s" % (base, other))

    for start, end in touching:
        other = intervals.Interval(start, end)
        self.assertTrue(base.intersects(other),
                        "should intersect: %s, %s" % (base, other))
def test_orfs_from_aa_seq(self):
    '''_orfs_from_aa_seq() should return the right Intervals for each amino acid string'''
    Interval = intervals.Interval
    # (amino acid string, expected list of Intervals)
    cases = [
        ('', []),
        ('*', []),
        ('**', []),
        ('A', [Interval(0, 0)]),
        ('A*A*A', [Interval(0, 1), Interval(2, 3), Interval(4, 4)]),
        ('AB**CDE*AB', [Interval(0, 2), Interval(4, 7), Interval(8, 9)]),
        ('*ABCDE*', [Interval(1, 6)]),
        ('**ABCDE**', [Interval(2, 7)]),
    ]
    for aa_seq, expected in cases:
        found = sequences._orfs_from_aa_seq(aa_seq)
        self.assertListEqual(expected, found)
def orfs(self, frame=0, revcomp=False):
    '''Finds the open reading frames in one translation frame of the sequence.

    frame is the offset passed to translate() and must be 0, 1 or 2. If
    revcomp is True, the reverse complement of the sequence is translated
    instead. Returns a list of Intervals in zero-based nucleotide coords;
    for revcomp=True the coords are converted back so that they refer to
    the sequence in its original orientation.'''
    assert frame in [0, 1, 2]

    if revcomp:
        self.revcomp()
    try:
        # trailing 'X' of the translation is dropped before looking for ORFs
        aa_seq = self.translate(frame=frame).seq.rstrip('X')
    finally:
        # revcomp() is applied to this object (its return value is not used),
        # so undo it even if translate() failed, otherwise the sequence
        # would be left reverse complemented
        if revcomp:
            self.revcomp()

    seq_length = len(self)
    nucleotide_orfs = []
    for aa_orf in _orfs_from_aa_seq(aa_seq):
        # each amino acid position covers 3 nucleotides
        if revcomp:
            # count back from the end of the sequence to flip the coords
            start = seq_length - (aa_orf.end * 3 + 3) - frame
            end = seq_length - (aa_orf.start * 3) - 1 - frame
        else:
            start = aa_orf.start * 3 + frame
            end = aa_orf.end * 3 + 2 + frame
        nucleotide_orfs.append(intervals.Interval(start, end))
    return nucleotide_orfs
def test_gaps(self):
    '''gaps() should report the coords of every run of N in a sequence'''
    Interval = intervals.Interval
    # (sequence, expected gaps)
    cases = [
        ('ACGT', []),
        ('NACGT', [Interval(0, 0)]),
        ('NACGTN', [Interval(0, 0), Interval(5, 5)]),
        ('ANNCGT', [Interval(1, 2)]),
        ('NANNCGTNN', [Interval(0, 0), Interval(2, 3), Interval(7, 8)]),
    ]
    fastas = [sequences.Fasta('ID', seq) for seq, _ in cases]
    for fasta, (_, expected) in zip(fastas, cases):
        found = fasta.gaps()
        self.assertListEqual(expected, found)
def test_comparisons(self):
    '''<, <=, == and != should work as expected'''
    # a new object is built for every operand, so that no comparison can
    # pass just because both sides are the very same object
    Interval = intervals.Interval

    # ordering
    self.assertTrue(Interval(1, 2) < Interval(2, 2))
    self.assertTrue(Interval(1, 2) <= Interval(2, 2))
    self.assertFalse(Interval(2, 2) <= Interval(1, 2))
    self.assertFalse(Interval(2, 2) < Interval(1, 2))
    self.assertFalse(Interval(2, 2) < Interval(2, 2))

    # equality
    self.assertTrue(Interval(1, 2) == Interval(1, 2))
    self.assertFalse(Interval(1, 2) == Interval(1, 3))

    # inequality
    self.assertTrue(Interval(1, 2) != Interval(1, 3))
    self.assertFalse(Interval(1, 2) != Interval(1, 2))
def test_all_orfs(self):
    '''all_orfs() should return the expected (Interval, bool) pairs for the test sequence'''
    seqs = {}
    tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_orfs.fa'), seqs)
    found = seqs['1'].all_orfs(min_length=120)

    # NOTE(review): the bool is presumably True for an ORF on the reverse
    # strand - confirm against all_orfs()
    expected = [
        (intervals.Interval(27, 221), False),
        (intervals.Interval(44, 226), False),
        (intervals.Interval(48, 170), True),
        (intervals.Interval(109, 240), False),
        (intervals.Interval(143, 265), True),
        (intervals.Interval(227, 421), False),
        (intervals.Interval(277, 432), True),
        (intervals.Interval(286, 477), False),
        (intervals.Interval(288, 518), True),
        (intervals.Interval(562, 702), False),
        (intervals.Interval(600, 758), False),
        (intervals.Interval(605, 817), False),
        (intervals.Interval(818, 937), False),
        (intervals.Interval(835, 987), False),
        (intervals.Interval(864, 998), False),
    ]

    self.assertEqual(len(found), len(expected))
    # no print needed: assertEqual shows both values if a comparison fails
    for got, want in zip(found, expected):
        self.assertEqual(got[0], want[0])
        self.assertEqual(got[1], want[1])
def test_orfs(self):
    '''orfs() should give the right nucleotide coords for each frame and strand'''
    Fasta = sequences.Fasta
    Interval = intervals.Interval
    # (sequence, frame, revcomp, expected ORFs)
    cases = [
        (Fasta('ID', 'AAACCCGG'), 0, False, [Interval(0, 5)]),
        (Fasta('ID', 'AAAACCCGG'), 1, False, [Interval(1, 6)]),
        (Fasta('ID', 'AAAAACCCGG'), 2, False, [Interval(2, 7)]),
        (Fasta('ID', 'CCGGGTTT'), 0, True, [Interval(2, 7)]),
        (Fasta('ID', 'CCGGGTTTT'), 1, True, [Interval(2, 7)]),
        (Fasta('ID', 'CCGGGTTTTT'), 2, True, [Interval(2, 7)]),
        (Fasta('ID', 'AAACCCTGA'), 0, False, [Interval(0, 8)]),
        (Fasta('ID', 'AAACCCTGATAG'), 0, False, [Interval(0, 8)]),
        (Fasta('ID', 'AAACCCTGA'), 1, False, [Interval(1, 6)]),
        (Fasta('ID', ''), 0, False, []),
        (Fasta('ID', 'A'), 0, False, []),
        (Fasta('ID', 'AA'), 0, False, []),
        (Fasta('ID', 'AAA'), 0, False, [Interval(0, 2)]),
        (Fasta('ID', 'AAAAAA'), 0, False, [Interval(0, 5)]),
        (Fasta('ID', 'AAA'), 1, False, []),
        (Fasta('ID', 'AAA'), 2, False, []),
        (Fasta('ID', 'AAA'), 0, True, [Interval(0, 2)]),
        (Fasta('ID', 'AAA'), 1, True, []),
        (Fasta('ID', 'AAA'), 2, True, []),
        (Fasta('ID', 'TAA'), 0, False, []),
        (Fasta('ID', 'CTA'), 0, True, []),
    ]
    for fasta, frame, revcomp, expected in cases:
        found = fasta.orfs(frame=frame, revcomp=revcomp)
        self.assertListEqual(found, expected)
def test_contig_coords(self):
    '''contig_coords() should report the coords of every stretch that is not N'''
    Interval = intervals.Interval
    # (sequence, expected contigs)
    cases = [
        ('ACGT', [Interval(0, 3)]),
        ('NACGT', [Interval(1, 4)]),
        ('NNACGT', [Interval(2, 5)]),
        ('ACGTN', [Interval(0, 3)]),
        ('ACGTNN', [Interval(0, 3)]),
        ('NANNCGT', [Interval(1, 1), Interval(4, 6)]),
        ('ACNNNGTNA', [Interval(0, 1), Interval(5, 6), Interval(8, 8)]),
        ('ANNCGTNNAAAAA', [Interval(0, 0), Interval(3, 5), Interval(8, 12)]),
    ]
    fastas = [sequences.Fasta('ID', seq) for seq, _ in cases]
    for fasta, (_, expected) in zip(fastas, cases):
        contigs = fasta.contig_coords()
        self.assertListEqual(expected, contigs)
def test_len(self):
    '''len() of an Interval should count both its start and end positions'''
    # ((start, end), expected length)
    cases = [
        ((1, 2), 2),
        ((1, 1), 1),
        ((10, 20), 11),
    ]
    for (start, end), expected in cases:
        self.assertEqual(len(intervals.Interval(start, end)), expected)
def test_contains(self):
    '''contains() should be True only for intervals lying completely inside this one'''
    outer = intervals.Interval(5, 10)
    outside_coords = [
        (1, 2), (4, 5), (4, 10), (4, 11), (5, 11),
        (1, 2), (9, 11), (10, 11), (11, 20),
    ]
    inside_coords = [(5, 5), (5, 10), (6, 7), (6, 10), (10, 10)]

    for start, end in outside_coords:
        other = intervals.Interval(start, end)
        self.assertFalse(outer.contains(other),
                         "shouldn't contain: %s, %s" % (outer, other))

    for start, end in inside_coords:
        other = intervals.Interval(start, end)
        self.assertTrue(outer.contains(other),
                        "should contain: %s, %s" % (outer, other))
def test_remove_contained_in_list(self):
    '''remove_contained_in_list() should drop every interval that lies inside another one'''
    before_coords = [
        (1, 2), (4, 4), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
        (20, 25), (20, 24), (20, 26),
        (30, 38), (30, 37), (30, 36), (30, 35), (30, 35), (32, 33),
        (38, 50), (65, 70), (67, 70),
    ]
    after_coords = [
        (1, 2), (4, 5), (5, 6), (7, 9), (8, 10), (9, 11),
        (20, 26), (30, 38), (38, 50), (65, 70),
    ]
    to_filter = [intervals.Interval(start, end) for start, end in before_coords]
    expected = [intervals.Interval(start, end) for start, end in after_coords]

    # the function changes the list it is given, so check that same list afterwards
    intervals.remove_contained_in_list(to_filter)
    self.assertSequenceEqual(to_filter, expected)
def test_intersection(self):
    '''intersection() should give the positions shared by two lists of intervals'''
    def make_list(coords):
        return [intervals.Interval(start, end) for start, end in coords]

    first = make_list([(1, 2), (10, 20), (51, 52), (54, 55), (57, 58)])
    second = make_list([(5, 6), (9, 11), (13, 14), (17, 18), (20, 25), (50, 60)])
    far_away = make_list([(100, 200)])
    shared = make_list([(10, 11), (13, 14), (17, 18), (20, 20), (51, 52), (54, 55), (57, 58)])

    # same answer whichever list comes first
    self.assertSequenceEqual(intervals.intersection(first, second), shared)
    self.assertSequenceEqual(intervals.intersection(second, first), shared)

    # nothing in common
    self.assertSequenceEqual(intervals.intersection(far_away, first), [])

    # an empty list on either side
    self.assertEqual(intervals.intersection([], first), [])
    self.assertEqual(intervals.intersection(first, []), [])