def test_string_with_many_gaps_returns_original_without_gap(self):
    """Interior and trailing gap characters are all stripped from the input."""
    gapped = "A-CGT--"
    self.assertEqual(remove_gaps(gapped), "ACGT")
def test_empty_string_returns_empty(self):
    """An empty input comes back as an empty string."""
    result = remove_gaps("")
    self.assertEqual(result, "")
def test_string_with_no_gaps_returns_original(self):
    """A sequence that contains no gap character is returned unchanged."""
    ungapped = "ACGT"
    self.assertEqual(remove_gaps(ungapped), ungapped)
def test_all_input_space_doesnt_break(self, seq):
    """Whatever ``seq`` is supplied, the output of ``remove_gaps`` has no "-".

    ``seq`` is provided by whatever drives this test.
    NOTE(review): presumably a property-based generator decorator sits above
    this method -- confirm against the surrounding file.
    """
    actual = remove_gaps(seq)
    # assertNotIn reports the offending value on failure, unlike
    # assertFalse("-" in actual), which only says "True is not false".
    self.assertNotIn("-", actual)
def interval_partition(self):
    """Partition the consensus into match and non-match intervals.

    Positions ``0 .. self.length - 1`` of ``self.consensus`` are scanned.
    Any character other than ``"*"`` counts as a match position. A run of
    match positions that is terminated by a ``"*"`` becomes a match interval
    when its length after ``remove_gaps`` is at least
    ``self.min_match_length``; a run that reaches the end of the consensus
    is compared using its raw length instead. Runs that are too short are
    absorbed into the surrounding non-match interval.

    Special case: when ``self.length < self.min_match_length`` the whole
    consensus is emitted as one interval -- a match interval if it contains
    no ``"*"``, otherwise a non-match interval.

    Returns:
        A ``(match_intervals, non_match_intervals)`` tuple. Each element is
        a list of ``[start, end]`` pairs with inclusive ends.

    Raises:
        AssertionError: if some position is not covered by exactly one
            interval (internal consistency check).
    """
    match_intervals = []
    non_match_intervals = []
    match_count, match_start, non_match_start = 0, 0, 0

    logging.debug("consensus: %s", self.consensus)
    for i in range(self.length):
        if self.consensus[i] != "*":
            # In a match region.
            if match_count == 0:
                match_start = i
            match_count += 1
        elif match_count > 0:
            # Reached a non-match: check whether the run that just ended is
            # long enough (ignoring gaps) to be recorded as a match interval.
            match_string = remove_gaps(
                self.consensus[match_start:match_start + match_count])
            logging.debug("have match string %s", match_string)

            if len(match_string) >= self.min_match_length:
                if non_match_start < match_start:
                    non_match_intervals.append(
                        [non_match_start, match_start - 1])
                    logging.debug("add non-match interval [%s,%s]",
                                  non_match_start, match_start - 1)
                end = match_start + match_count - 1
                match_intervals.append([match_start, end])
                logging.debug("add match interval [%s,%s]", match_start, end)
                non_match_start = i
            # A run that was too short stays part of the open non-match
            # region, so the match bookkeeping is reset to its start.
            match_count = 0
            match_start = non_match_start

    end = self.length - 1
    if self.length < self.min_match_length:
        # Special case: a short sequence can still get classified as a match
        # interval, provided it holds no "*" (non-match) position at all.
        added_interval = "non_match" if "*" in self.consensus else "match"
        if added_interval == "match":
            match_intervals.append([0, end])
        else:
            non_match_intervals.append([0, end])
        logging.debug("add whole short %s interval [0,%s]",
                      added_interval, end)
        # Everything is covered already: disable the trailing-interval logic.
        match_count = 0
        non_match_start = end + 1

    # At end add last intervals
    if match_count > 0:
        if match_count >= self.min_match_length:
            match_intervals.append([match_start, end])
            logging.debug("add final match interval [%s,%s]",
                          match_start, end)
            if non_match_start < match_start:
                # The pending non-match region stops just before this match.
                end = match_start - 1
    if match_count != self.length and non_match_start <= end:
        non_match_intervals.append([non_match_start, end])
        logging.debug("add non-match interval [%s,%s]", non_match_start, end)

    # check all stretches of consensus are in an interval, and intervals
    # don't overlap
    for i in range(self.length):
        count_match = sum(
            1 for start, stop in match_intervals if start <= i <= stop)
        count_non_match = sum(
            1 for start, stop in non_match_intervals if start <= i <= stop)

        assert count_match | count_non_match, (
            "Failed to correctly identify match intervals: position %d "
            "appeared in both/neither match and non-match intervals" % i)
        assert count_match + count_non_match == 1, (
            "Failed to correctly identify match intervals: position "
            "%d appeared in %d intervals" %
            (i, count_match + count_non_match))

    return match_intervals, non_match_intervals