Ejemplo n.º 1
0
    def test_string_with_many_gaps_returns_original_without_gap(self):
        """Every '-' in the input, interior or trailing, is stripped out."""
        gapped_sequence = "A-CGT--"
        self.assertEqual(remove_gaps(gapped_sequence), "ACGT")
Ejemplo n.º 2
0
    def test_empty_string_returns_empty(self):
        """An empty sequence passes through unchanged."""
        self.assertEqual(remove_gaps(""), "")
Ejemplo n.º 3
0
    def test_string_with_no_gaps_returns_original(self):
        """A sequence without any '-' comes back equal to the input."""
        ungapped_sequence = "ACGT"
        self.assertEqual(remove_gaps(ungapped_sequence), ungapped_sequence)
Ejemplo n.º 4
0
    def test_all_input_space_doesnt_break(self, seq):
        """Whatever `seq` is supplied, the result contains no '-' character."""
        stripped = remove_gaps(seq)
        has_gap = "-" in stripped
        self.assertFalse(has_gap)
Ejemplo n.º 5
0
    def interval_partition(self):
        """Partition the consensus into match and non-match intervals.

        Walks ``self.consensus`` over ``range(self.length)``. A column whose
        consensus character is not ``"*"`` belongs to a match run; ``"*"``
        ends the run. A run becomes a match interval when it is long enough
        compared with ``self.min_match_length``; everything else is assigned
        to non-match intervals.

        Returns:
            A tuple ``(match_intervals, non_match_intervals)``. Each element
            is a list of ``[start, end]`` pairs with inclusive indices.

        Raises:
            AssertionError: if a position ends up covered by no interval or
                by more than one interval.
        """
        match_intervals = []
        non_match_intervals = []
        match_count, match_start, non_match_start = 0, 0, 0

        logging.debug("consensus: %s", self.consensus)
        for i in range(self.length):
            letter = self.consensus[i]
            if letter != "*":
                # In a match region.
                if match_count == 0:
                    match_start = i
                match_count += 1
            elif match_count > 0:
                # Reached a non-match column: decide whether the run that just
                # ended is long enough (ignoring gaps) to be a match interval.
                match_string = remove_gaps(
                    self.consensus[match_start:match_start + match_count])
                match_len = len(match_string)
                logging.debug("have match string %s", match_string)

                if match_len >= self.min_match_length:
                    if non_match_start < match_start:
                        # Flush the pending non-match stretch before the run.
                        non_match_intervals.append(
                            [non_match_start, match_start - 1])
                        logging.debug(
                            f"add non-match interval [{non_match_start},{match_start - 1}]"
                        )
                    end = match_start + match_count - 1
                    match_intervals.append([match_start, end])
                    logging.debug(f"add match interval [{match_start},{end}]")
                    non_match_start = i
                # A too-short run is absorbed into the current non-match
                # stretch, so the next run restarts from non_match_start.
                match_count = 0
                match_start = non_match_start

        end = self.length - 1
        if self.length < self.min_match_length:
            # Special case: a short sequence can still get classified as a
            # match interval. "*" marks a non-match column, so the whole
            # sequence is a match only when the consensus contains no "*".
            added_interval = "non_match" if "*" in self.consensus else "match"
            if added_interval == "match":
                match_intervals.append([0, end])
            else:
                non_match_intervals.append([0, end])
            logging.debug(
                f"add whole short {added_interval} interval [0,{end}]")
            # Everything is now covered; disable the tail handling below.
            match_count = 0
            non_match_start = end + 1

        # At end add last intervals
        if match_count > 0:
            # NOTE(review): this compares the raw run length, whereas the loop
            # above compares the gap-stripped length - confirm this is intended.
            if match_count >= self.min_match_length:
                match_intervals.append([match_start, end])
                logging.debug(
                    f"add final match interval [{match_start},{end}]")
                if non_match_start < match_start:
                    # Leave room for a non-match stretch before the final run.
                    end = match_start - 1
        if match_count != self.length and non_match_start <= end:
            non_match_intervals.append([non_match_start, end])
            logging.debug(f"add non-match interval [{non_match_start},{end}]")

        # Check all stretches of consensus are in an interval, and intervals
        # don't overlap.
        for i in range(self.length):
            count_match = sum(
                1 for start, stop in match_intervals if start <= i <= stop)
            count_non_match = sum(
                1 for start, stop in non_match_intervals
                if start <= i <= stop)

            assert count_match | count_non_match, (
                "Failed to correctly identify match intervals: position %d "
                "appeared in both/neither match and non-match intervals" % i)
            assert count_match + count_non_match == 1, (
                "Failed to correctly identify match intervals: position "
                "%d appeared in %d intervals" %
                (i, count_match + count_non_match))

        return match_intervals, non_match_intervals