def _get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords): '''Finds variants where ref has the variant and so does the contig. Which means that there was no mummer call to flag it up so need to look through the known ref variants. Also need to check that the variant is in a nucmer match to an assembly contig.''' variants = [] for ref_variant_pos, ref_variants_set in sorted(known_ref_variants.items()): for known_ref_variant in ref_variants_set: if known_ref_variant not in used_ref_variants: variant_pos_matches_contig = False pos = known_ref_variant.variant.position if known_ref_variant.seq_type == 'n': ref_interval = intervals.Interval(pos, pos) elif known_ref_variant.seq_type == 'p': ref_interval = intervals.Interval(3 * pos, 3 * pos + 2) else: raise Error('Unexpected variant type "' + known_ref_variant.variant_type + '" in _get_remaining_known_ref_variants. Cannot continue') for interval in nucmer_coords: if ref_interval.intersects(interval): variant_pos_matches_contig = True break if variant_pos_matches_contig: variants.append((None, known_ref_variant.seq_type, None, None, None, {known_ref_variant}, set())) return variants
def test_intersection(self):
    '''intersection() gives None for disjoint intervals, else the shared range'''
    base = intervals.Interval(5, 10)
    overlapping = intervals.Interval(8, 15)
    disjoint = intervals.Interval(12, 20)

    cases = [
        (disjoint, None),
        (overlapping, intervals.Interval(8, 10)),
    ]
    for other, expected in cases:
        self.assertEqual(base.intersection(other), expected)
def test_length_sum_from_list(self):
    '''length_sum_from_list() adds up the lengths of all intervals in a list'''
    endpoints = [(1, 2), (4, 5), (10, 19)]
    interval_list = [intervals.Interval(start, end) for start, end in endpoints]
    total = intervals.length_sum_from_list(interval_list)
    self.assertEqual(14, total)
def __init__(self, contig1, start1, end1, contig2, start2, end2):
    '''Stores a hit between two contigs.

    For each contig the coordinates are kept as an Interval with
    start <= end, and self.rev records whether the given start was larger
    than the given end. If the first contig name compares greater than the
    second, self.reverse() is called.'''
    def ordered_interval(first, second):
        # Interval is built from the smaller and larger of the two values
        return intervals.Interval(min(first, second), max(first, second))

    self.names = [contig1, contig2]
    self.rev = {contig1: start1 > end1, contig2: start2 > end2}
    self.coords = {
        contig1: ordered_interval(start1, end1),
        contig2: ordered_interval(start2, end2),
    }

    first_name, second_name = self.names
    if first_name > second_name:
        self.reverse()
def test_set_coords(self):
    '''_set_coords() recomputes coords, before and after an exon is added'''
    def recomputed_coords():
        # wipe the coords so that the value must come from _set_coords()
        self.trans.coords = None
        self.trans._set_coords()
        return self.trans.coords

    self.assertEqual(recomputed_coords(), intervals.Interval(42, 100))

    self.trans.add_gff_record(self.gff_exon)
    self.assertEqual(recomputed_coords(), intervals.Interval(44, 53))
def test_init(self):
    '''Interval() must raise Error for non-int coords or when end < start'''
    bad_arguments = [
        ('a', 1),
        (1, 'a'),
        ('a', 'a'),
        (3, 2),
    ]
    for start, end in bad_arguments:
        with self.assertRaises(intervals.Error):
            intervals.Interval(start, end)
def test_can_extend_start(self):
    '''can_extend_start() respects min_extend and rejects a later start'''
    self.trans.add_gff_record(self.gff_exon)
    candidate = copy.deepcopy(self.trans)

    candidate.coords = intervals.Interval(42, 44)
    self.assertTrue(self.trans.can_extend_start(candidate))
    self.assertTrue(self.trans.can_extend_start(candidate, min_extend=2))
    for too_large in (3, 4):
        self.assertFalse(self.trans.can_extend_start(candidate, min_extend=too_large))

    candidate.coords = intervals.Interval(44, 50)
    self.assertFalse(self.trans.can_extend_start(candidate))
def test_union_flll_gap(self):
    '''union_fill_gap() returns the full span of both intervals, whether or
    not they overlap, and is symmetric in its arguments'''
    a = intervals.Interval(5, 10)
    b = intervals.Interval(8, 15)
    c = intervals.Interval(12, 20)
    d = intervals.Interval(21, 22)

    cases = [
        (a, c, (5, 20)),
        (a, b, (5, 15)),
        (c, d, (12, 22)),
    ]
    for first, second, (start, end) in cases:
        self.assertEqual(first.union_fill_gap(second), intervals.Interval(start, end))
        self.assertEqual(second.union_fill_gap(first), intervals.Interval(start, end))
def _orfs_from_aa_seq(seq): orfs = [] pos = 0 while pos < len(seq): next_stop = seq.find('*', pos) if next_stop == -1: orfs.append(intervals.Interval(pos, len(seq)-1)) break elif next_stop > pos: orfs.append(intervals.Interval(pos, next_stop)) pos = next_stop + 1 return orfs
def test_change_hit_coords_with_intersection(self):
    '''_change_hit_coords_with_intersection() trims both ends of the hit,
    for every combination of hit orientation and trimmed contig'''
    # (start1, end1, start2, end2) of the edge, contig that gets trimmed to
    # 20-30, then (start1, end1, start2, end2) of the expected edge
    cases = [
        ((1, 42, 10, 50), 'c1', (20, 30, 29, 38)),
        ((1, 42, 10, 50), 'c2', (11, 22, 20, 30)),
        ((1, 42, 50, 10), 'c1', (20, 30, 31, 22)),
        ((1, 42, 50, 10), 'c2', (21, 32, 30, 20)),
        ((42, 1, 10, 50), 'c1', (30, 20, 22, 31)),
        ((42, 1, 10, 50), 'c2', (32, 21, 20, 30)),
        ((42, 1, 50, 10), 'c1', (30, 20, 38, 29)),
        ((42, 1, 50, 10), 'c2', (22, 11, 30, 20)),
    ]

    for (start1, end1, start2, end2), contig, (exp1, exp2, exp3, exp4) in cases:
        e = edge.Edge('c1', start1, end1, 'c2', start2, end2)
        e._change_hit_coords_with_intersection(contig, intervals.Interval(20, 30))
        self.assertEqual(e, edge.Edge('c1', exp1, exp2, 'c2', exp3, exp4))
def orfs(self, frame=0, revcomp=False):
    '''Returns a list of the ORFs found in the sequence when it is
    translated starting at the given frame (0, 1 or 2). Each ORF is an
    intervals.Interval in nucleotide coordinates.

    If revcomp=True, the ORFs of the reverse complement are found instead:
    the sequence is reverse complemented for the translation and then
    reverse complemented again afterwards, and the coordinates are
    converted using len(self).'''
    assert frame in [0,1,2]

    if revcomp:
        self.revcomp()

    protein = self.translate(frame=frame).seq.rstrip('X')

    if revcomp:
        # undo the in-place reverse complement made above
        self.revcomp()

    nucleotide_orfs = []
    for aa_orf in _orfs_from_aa_seq(protein):
        if revcomp:
            start = len(self) - (aa_orf.end * 3 + 3) - frame
            end = len(self) - (aa_orf.start * 3) - 1 - frame
        else:
            start = aa_orf.start * 3 + frame
            end = aa_orf.end * 3 + 2 + frame

        nucleotide_orfs.append(intervals.Interval(start, end))

    return nucleotide_orfs
def gaps(self, min_length = 1):
    '''Finds the positions of all gaps (runs of N or n) in the sequence that
    are at least min_length long.

    Returns a list of Intervals in the order the gaps occur. Coords are
    zero-based and the end of each Interval is the last position of the
    gap.'''
    gaps = []
    regex = re.compile('N+', re.IGNORECASE)

    for m in regex.finditer(self.seq):
        # span() is half-open (start, end), so the gap length is
        # end - start with no +1
        start, end = m.span()
        if end - start >= min_length:
            gaps.append(intervals.Interval(start, end - 1))

    return gaps
def might_extend(self, other, min_extend=1):
    '''Returns True iff other could extend self by at least min_extend.

    All of the following must hold: both are on the same seqname; the
    strands are compatible; both have at least one exon; other overlaps or
    is directly adjacent to self's coords widened by min_extend - 1 at each
    end; and other reaches beyond those widened coords on at least one
    side.

    Strands are compatible when they are equal and not '.' or
    'Inconsistent', or when self is '+' or '-', other is '.', and both have
    exactly one exon.'''
    widened = intervals.Interval(self.coords.start - min_extend + 1, self.coords.end + min_extend - 1)

    if self.strand == other.strand and self.strand not in ['.', 'Inconsistent']:
        strands_ok = True
    else:
        strands_ok = self.strand in ['-', '+'] and other.strand == '.' and len(self.exons) == len(other.exons) == 1

    if not (self.seqname == other.seqname and strands_ok):
        return False

    if len(self.exons) * len(other.exons) == 0:
        return False

    overlaps_or_adjacent = widened.intersects(other.coords) \
        or other.coords.end + 1 == widened.start \
        or widened.end + 1 == other.coords.start
    if not overlaps_or_adjacent:
        return False

    return other.coords.start < widened.start or widened.end < other.coords.end
def contig_coords(self):
    '''Finds coords of contigs, i.e. every stretch of the sequence that is
    not reported as a gap by self.gaps().

    Returns a list of Intervals. Coords are zero-based.'''
    gap_list = self.gaps()
    seq_length = len(self)

    if not gap_list:
        return [intervals.Interval(0, seq_length - 1)]

    # Flat list of alternating contig start, contig end, contig start, ...
    # derived from the positions either side of each gap
    boundaries = [0]
    for gap in gap_list:
        if gap.start == 0:
            # sequence begins with a gap, so the first contig starts after it
            boundaries = [gap.end + 1]
        else:
            boundaries.extend([gap.start - 1, gap.end + 1])

    # close the final contig unless the sequence ends with a gap
    if boundaries[-1] < seq_length:
        boundaries.append(seq_length - 1)

    contig_starts = boundaries[0::2]
    contig_ends = boundaries[1::2]
    return [intervals.Interval(start, end) for start, end in zip(contig_starts, contig_ends)]
def test_union(self):
    '''union() gives None when there is a gap between the intervals,
    otherwise the merged interval, whichever way round it is called'''
    a = intervals.Interval(5, 10)
    b = intervals.Interval(8, 15)
    c = intervals.Interval(12, 20)
    d = intervals.Interval(21, 22)

    cases = [
        (a, c, None),
        (a, b, intervals.Interval(5, 15)),
        (c, d, intervals.Interval(12, 22)),
    ]
    for first, second, expected in cases:
        self.assertEqual(first.union(second), expected)
        self.assertEqual(second.union(first), expected)
def test_distance_to_point(self):
    '''distance_to_point() is 0 inside the interval, else the distance to
    the nearest end'''
    expected_by_point = [
        (0, 42),
        (0, 44),
        (0, 50),
        (1, 41),
        (1, 51),
        (5, 55),
        (5, 37),
    ]
    for expected, point in expected_by_point:
        self.assertEqual(expected, intervals.Interval(42, 50).distance_to_point(point))
def test_merged_coords_from_simple_nonredundant_path(self):
    '''merged_coords_from_simple_nonredundant_path() on a two-node path
    and then on a three-node path in a fresh graph'''
    # two contigs joined by a single edge
    self.g.add_edge(edge.Edge('c1', 199, 0, 'c2', 1319, 1119))
    got = self.g.merged_coords_from_simple_nonredundant_path(['c1', 'c2'])
    wanted = [
        ['c1', intervals.Interval(200, 659), True],
        ['c2', intervals.Interval(0, 1319), True],
    ]
    self.assertListEqual(wanted, got)

    # three contigs in a chain, starting from a new graph
    self.g = graph.Graph(self.asm)
    self.g.add_edge(edge.Edge('c1', 610, 652, 'c2', 1, 42))
    self.g.add_edge(edge.Edge('c2', 1250, 1310, 'c3', 5, 65))
    got = self.g.merged_coords_from_simple_nonredundant_path(['c1', 'c2', 'c3'])
    wanted = [
        ['c1', intervals.Interval(0, 609), False],
        ['c2', intervals.Interval(1, 1249), False],
        ['c3', intervals.Interval(5, 2159), False],
    ]
    self.assertListEqual(wanted, got)
def is_complete_orf(self):
    '''Returns True iff the length is >= 6, is a multiple of 3, and one of
    the ORFs returned by self.orfs() covers the whole sequence.

    NOTE(review): the earlier docstring also promised exactly one stop
    codon, at the end. This method only compares ORF coordinates, so
    confirm that orfs() enforces that.'''
    length = len(self)
    if length < 6 or length % 3 != 0:
        return False

    candidate_orfs = self.orfs()
    whole_sequence = intervals.Interval(0, len(self) - 1)
    return any(orf == whole_sequence for orf in candidate_orfs)
def test_merge_overlapping_in_list(self):
    '''merge_overlapping_in_list() merges the list in place'''
    unmerged_coords = [(1, 2), (51, 60), (10, 20), (20, 30), (20, 30), (29, 50), (65, 70)]
    merged_coords = [(1, 2), (10, 60), (65, 70)]

    interval_list = [intervals.Interval(start, end) for start, end in unmerged_coords]
    expected = [intervals.Interval(start, end) for start, end in merged_coords]

    intervals.merge_overlapping_in_list(interval_list)
    self.assertSequenceEqual(interval_list, expected)
def _node_to_coords(self, nodes, i):
    '''Works out which part of node nodes[i] to use, given its neighbours
    in the path.

    nodes: list of node names forming a path; each consecutive pair must be
    joined by exactly one edge in self.graph (asserted).
    i: index into nodes of the node of interest.

    Returns a list [node, coords, rev] where coords is an
    intervals.Interval on the node and rev is e.rev[node] for the edge
    that was used.

    Note that this calls make_contig_first() (and, for the last node,
    reverse()) on edge objects stored in self.graph.'''
    assert 0 <= i < len(nodes)
    node = nodes[i]

    # The last node has no following node, so use the edge that leads into
    # it. Every other node uses the edge to the next node.
    if i == len(nodes) - 1:
        edges = self.graph[nodes[i - 1]][node]['edges']
    else:
        edges = self.graph[node][nodes[i + 1]]['edges']

    # Every node except the first also needs the edge from the previous node
    if i > 0:
        previous_edges = self.graph[nodes[i - 1]][node]['edges']
        assert len(previous_edges) == 1
        previous_e = previous_edges[0]
        # presumably orients the edge so that nodes[i - 1] is listed first;
        # verify against edge.Edge
        previous_e.make_contig_first(nodes[i - 1])
        previous_open_end = previous_e.open_end(node)

    assert len(edges) == 1
    e = edges[0]
    e.make_contig_first(node)
    open_end = e.open_end(node)

    if 0 < i < len(nodes) - 1:
        # middle node: bounded by the hit starts of the incoming and
        # outgoing edges, whose open ends must differ
        assert open_end != previous_open_end
        coords = intervals.Interval(
            min(e.coords[node].start, previous_e.coords[node].start),
            max(e.coords[node].start - 1, previous_e.coords[node].start - 1))
    elif i == 0:
        # first node: from its open end up to, but not including, the hit
        if open_end == edge.LEFT:
            coords = intervals.Interval(0, e.coords[node].start - 1)
        else:
            coords = intervals.Interval(e.coords[node].end + 1, self.contig_lengths[node] - 1)
    else:
        # last node: from its open end up to and including the hit
        e.reverse() # now node is second in the edge, not first
        open_end = e.open_end(node)
        if open_end == edge.LEFT:
            coords = intervals.Interval(0, e.coords[node].end)
        else:
            coords = intervals.Interval(e.coords[node].start, self.contig_lengths[node] - 1)

    return [node, coords, e.rev[node]]
def _set_coords(self): if len(self.transcripts) > 0: start = min([t.coords.start for t in self.transcripts.values()]) end = max([t.coords.end for t in self.transcripts.values()]) self.coords = intervals.Interval(start, end) elif self.gene_record is not None: self.coords = self.gene_record.coords else: raise Error('Error setting coordinates for gene ' + self.gene_id + ' - cannot continue') if self.gene_record is not None: self.gene_record.coords = self.coords
def test_init(self):
    '''A new transcript has the expected coords, strand, seqname and mRNA,
    and all of its feature lists are empty'''
    self.assertEqual(self.trans.coords, intervals.Interval(42, 100))
    self.assertEqual(self.trans.strand, '+')
    self.assertEqual(self.trans.seqname, 'seqname')

    feature_lists = [
        self.trans.five_utr,
        self.trans.three_utr,
        self.trans.exons,
        self.trans.ncRNA,
        self.trans.rRNA,
        self.trans.tRNA,
        self.trans.snRNA,
    ]
    for features in feature_lists:
        self.assertEqual(len(features), 0)

    self.assertEqual(self.trans.mRNA, self.gff_mRNA)
def _set_coords(self): try: start = min([t.coords.start for t in self.five_utr + self.three_utr + self.exons + self.ncRNA + self.rRNA + self.tRNA + self.snRNA]) end = max([t.coords.end for t in self.five_utr + self.three_utr + self.exons + self.ncRNA + self.rRNA + self.tRNA + self.snRNA]) except: if self.mRNA is not None: start = self.mRNA.coords.start end = self.mRNA.coords.end else: return self.coords = intervals.Interval(start, end) if self.mRNA is not None: self.mRNA.coords = self.coords
def test_intersects(self):
    '''intersects() is True exactly when two intervals share a position'''
    a = intervals.Interval(5, 10)
    disjoint_coords = [(3, 4), (11, 20)]
    overlapping_coords = [(3, 5), (3, 6), (9, 12), (10, 12), (6, 7), (1, 20)]

    no_intersect = [intervals.Interval(start, end) for start, end in disjoint_coords]
    intersect = [intervals.Interval(start, end) for start, end in overlapping_coords]

    for other in no_intersect:
        self.assertFalse(a.intersects(other), "shouldn't intersect: " + str(a) + ', ' + str(other))

    for other in intersect:
        self.assertTrue(a.intersects(other), 'should intersect: ' + str(a) + ', ' + str(other))
def test_orfs_from_aa_seq(self):
    '''_orfs_from_aa_seq() finds the right ORFs in amino acid strings'''
    # amino acid sequence -> (start, end) of each expected ORF
    cases = [
        ('', []),
        ('*', []),
        ('**', []),
        ('A', [(0, 0)]),
        ('A*A*A', [(0, 1), (2, 3), (4, 4)]),
        ('AB**CDE*AB', [(0, 2), (4, 7), (8, 9)]),
        ('*ABCDE*', [(1, 6)]),
        ('**ABCDE**', [(2, 7)]),
    ]

    for aa_seq, expected_coords in cases:
        expected = [intervals.Interval(start, end) for start, end in expected_coords]
        found = sequences._orfs_from_aa_seq(aa_seq)
        self.assertListEqual(expected, found)
def test_gaps(self):
    '''gaps() reports the zero-based coords of each run of Ns'''
    # nucleotide sequence -> (start, end) of each expected gap
    cases = [
        ('ACGT', []),
        ('NACGT', [(0, 0)]),
        ('NACGTN', [(0, 0), (5, 5)]),
        ('ANNCGT', [(1, 2)]),
        ('NANNCGTNN', [(0, 0), (2, 3), (7, 8)]),
    ]

    for nucleotides, expected_coords in cases:
        fasta = sequences.Fasta('ID', nucleotides)
        expected = [intervals.Interval(start, end) for start, end in expected_coords]
        self.assertListEqual(expected, fasta.gaps())
def orfs(self, frame=0, revcomp=False):
    '''Returns the ORFs of the sequence, translated from the given frame
    (0, 1 or 2), as a list of intervals.Interval objects in nucleotide
    coordinates.

    With revcomp=True the reverse complement is translated instead. The
    sequence is reverse complemented before the translation and again
    afterwards, and the ORF coordinates are converted using len(self).'''
    assert frame in [0, 1, 2]

    if revcomp:
        self.revcomp()
    translated = self.translate(frame=frame).seq.rstrip('X')
    if revcomp:
        # second call undoes the first
        self.revcomp()

    def to_nucleotide_coords(aa_orf):
        # converts one amino acid ORF into nucleotide coords
        if revcomp:
            return intervals.Interval(
                len(self) - (aa_orf.end * 3 + 3) - frame,
                len(self) - (aa_orf.start * 3) - 1 - frame)
        return intervals.Interval(
            aa_orf.start * 3 + frame,
            aa_orf.end * 3 + 2 + frame)

    return [to_nucleotide_coords(aa_orf) for aa_orf in _orfs_from_aa_seq(translated)]
def test_intersects(self):
    '''intersects() depends on whether the coords of the two transcripts
    overlap'''
    other = copy.deepcopy(self.trans)

    disjoint_coords = [(1, 41), (101, 141)]
    overlapping_coords = [(1, 42), (42, 50), (50, 60), (50, 100), (100, 142), (20, 424242)]
    not_intersects = [intervals.Interval(start, end) for start, end in disjoint_coords]
    intersects = [intervals.Interval(start, end) for start, end in overlapping_coords]

    for coords in not_intersects:
        other.coords = coords
        self.assertFalse(self.trans.intersects(other))

    for coords in intersects:
        other.coords = coords
        self.assertTrue(self.trans.intersects(other))
def test_contains(self):
    '''contains() is True only for intervals lying entirely inside'''
    a = intervals.Interval(5, 10)

    outside_coords = [(1, 2), (4, 5), (4, 10), (4, 11), (5, 11), (1, 2), (9, 11), (10, 11), (11, 20)]
    inside_coords = [(5, 5), (5, 10), (6, 7), (6, 10), (10, 10)]
    not_contained = [intervals.Interval(start, end) for start, end in outside_coords]
    contained = [intervals.Interval(start, end) for start, end in inside_coords]

    for other in not_contained:
        self.assertFalse(a.contains(other), "shouldn't contain: " + str(a) + ', ' + str(other))

    for other in contained:
        self.assertTrue(a.contains(other), 'should contain: ' + str(a) + ', ' + str(other))
def test_len(self):
    '''len() of an Interval counts both of its end positions'''
    cases = [
        (1, 2, 2),
        (1, 1, 1),
        (10, 20, 11),
    ]
    for start, end, expected_length in cases:
        self.assertEqual(len(intervals.Interval(start, end)), expected_length)