Esempio n. 1
0
 def testInputShorterThanKmer(self):
     """
     The GC content computation must raise a RuntimeError when the input
     sequence is shorter than the requested kmer size.
     """
     short_sequence = "AGGCCTA"
     oversized_kmer = 200
     # Callable form of assertRaises: the call is made by the test framework
     # and the test fails unless a RuntimeError is raised.
     self.assertRaises(RuntimeError,
                       annotation_utils.compute_gc_content,
                       short_sequence, oversized_kmer)
Esempio n. 2
0
 def testInvalidKmer(self):
     """
     The GC content computation must raise a RuntimeError when it is given
     a negative kmer length.
     """
     sequence = "AGGCCTA"
     negative_kmer = -2
     # Callable form of assertRaises: the call is made by the test framework
     # and the test fails unless a RuntimeError is raised.
     self.assertRaises(RuntimeError,
                       annotation_utils.compute_gc_content,
                       sequence, negative_kmer)
Esempio n. 3
0
 def testHappyPath(self):
     """
     A valid sequence with a valid kmer size yields one single-element
     GC content entry per sliding window.
     """
     sequence = "AGGCCTA"
     kmer_length = 2
     # One expected GC value per 2-base window: AG, GG, GC, CC, CT, TA
     expected = [[gc] for gc in (0.5, 1.0, 1.0, 1.0, 0.5, 0.0)]
     observed = annotation_utils.compute_gc_content(sequence, kmer_length)
     self.assertListEqual(observed, expected)
    def set_sequence_composition_stats(self, sequence):
        """
        Compute the GC statistics of this interval and store them on the
        instance (``_gc_content``, ``_gc_bases`` and ``_at_bases``).

        :param str sequence: The contig; only ``sequence[start:stop]`` is used
        """
        interval_bases = sequence[self.start:self.stop]
        interval_length = len(interval_bases)

        # Using the interval length as the kmer size requests a single window
        # covering the whole interval; its value is the first entry of the
        # first result row.
        gc_windows = utils.compute_gc_content(interval_bases, interval_length)
        self._gc_content = gc_windows[0][0]

        # Base counts are derived from the stored GC value; AT is the remainder
        self._gc_bases = round(interval_length * self._gc_content)
        self._at_bases = round(interval_length - self._gc_bases)
    def _compute_state_sequence(self, input_sequence):
        """
        Annotate the states of sequence

        The GC content is computed over sliding windows of ``_kmer_length``
        bases and the windows are clustered into ``_NUMBER_OF_STATES`` states.
        The per-window states are then padded so that each window's state
        sits roughly in the middle of its window and the returned list has
        (at least) one entry per position of the input sequence.

        As a side effect ``self._label_mapping`` is rebuilt: with a single
        observed state it maps to ``UNKNOWN_LABEL``; with two, the state
        whose first window has the higher GC content maps to
        ``HIGH_GC_CONTENT_LABEL`` and the other to ``LOW_GC_CONTENT_LABEL``.

        :param str input_sequence: Input sequence
        :return list(int): List of sequence states for each position in the sequence
        """

        self._label_mapping = {}
        gc_content_sequence = utils.compute_gc_content(
            input_sequence, kmer_size=self._kmer_length)
        window_states = utils.kmeans(gc_content_sequence,
                                     number_of_clusters=self._NUMBER_OF_STATES)

        # Remember the GC content of the first window seen for each of the
        # first two distinct states. This is done on the *unpadded* states so
        # that index i of the states and index i of the GC windows refer to
        # the same window (assumes kmeans returns one state per window --
        # TODO confirm). Looking the value up after padding would pair a
        # state with the wrong window and could index past the last window.
        # This is much quicker than labeling each state.
        candidates = {}
        for window_index, state in enumerate(window_states):
            if state not in candidates:
                candidates[state] = gc_content_sequence[window_index][0]
                if len(candidates) == 2:
                    break

        # list(dict) works on both Python 2 and 3 (dict.keys() is not
        # indexable on Python 3).
        observed_states = list(candidates)
        if len(observed_states) == 1:
            self._label_mapping[observed_states[0]] = self.UNKNOWN_LABEL
        elif len(observed_states) == 2:
            state_a, state_b = observed_states
            if candidates[state_a] > candidates[state_b]:
                self._label_mapping[state_a] = self.HIGH_GC_CONTENT_LABEL
                self._label_mapping[state_b] = self.LOW_GC_CONTENT_LABEL
            else:
                self._label_mapping[state_a] = self.LOW_GC_CONTENT_LABEL
                self._label_mapping[state_b] = self.HIGH_GC_CONTENT_LABEL

        first_state = window_states[0]
        last_state = window_states[-1]

        # we want GC content to be in the middle of the window, so shift the
        # states right by half a window by repeating the first state
        # (floor division keeps the count an integer on Python 3 as well)
        half_window = self._kmer_length // 2
        states = [first_state] * half_window
        states.extend(window_states)

        # since this is a sliding window approach the trailing positions are
        # not covered by any window start; continue with the final state
        missing = len(input_sequence) - len(states)
        if missing > 0:
            states.extend([last_state] * missing)

        return states