Esempio n. 1
0
 def test_GivenLongSequencesWithDiffAboveThreshold_AreNotRefLike(self):
     """Check that a set containing one strongly divergent sequence is not one-ref-like.

     The first three sequences each differ from "AATTA" at no more than one
     position, while "GCGGG" differs from it at every position.
     """
     sequences = ["AATTA", "AATTT", "TATTA", "GCGGG"]
     seqlen = len(sequences[0])
     # assertGreaterEqual reports both operands on failure, unlike assertTrue(a >= b)
     self.assertGreaterEqual(seqlen, LENGTH_THRESHOLD)
     # The distance threshold expected for sequences of this length is 1
     self.assertEqual(get_one_ref_like_threshold_distance(seqlen), 1)
     # The majority string the other sequences are compared against
     self.assertEqual(get_majority_string(sequences), "AATTA")
     self.assertFalse(sequences_are_one_reference_like(sequences))
Esempio n. 2
0
    def test_GivenLongSequencesWithDiffBelowThreshold_AreRefLike(self):
        """Check that sequences all within the distance threshold are one-ref-like."""
        sequences = ["AATTA", "AATTT", "TATTA"]
        # Sequences are above length threshold.
        # assertGreaterEqual reports both operands on failure, unlike assertTrue(a >= b)
        seqlen = len(sequences[0])
        self.assertGreaterEqual(seqlen, LENGTH_THRESHOLD)
        # We tolerate up to one diff against majority string
        self.assertEqual(get_one_ref_like_threshold_distance(seqlen), 1)

        # Here's the majority string
        self.assertEqual(get_majority_string(sequences), "AATTA")

        # No sequence is > distance threshold from the majority string
        self.assertTrue(sequences_are_one_reference_like(sequences))
Esempio n. 3
0
 def test_GivenLongSequencesFromRealDataWithDiffBelowThreshold_AreRefLike(self):
     """Regression test on real data reported in issue #15.

     These sequences used to get clustered, which produced ambiguous graphs
     with paths spelling the same sequence. They are expected to be treated
     as one-reference-like instead.
     """
     issue_15_sequences = [
         "gctccgccggtcccgccggtcc",
         "gctccgccgggcccgccggtcc",
         "tctccgccggtcccgccggtcc",
         "gctcagccggtcccgccggtcc",
         "gctccgccggtcccaccggtcc",
         "gctccgccggtaccgccggtcc",
         "gctccgctggtcccgccggtcc",
         "gctccgccggtcccgctggtcc",
         "gctccgccggtcccgccggtct",
         "gctccgccggtcccgcctgtcc",
         "gctccgccggtcctgccggtcc",
     ]
     # The distance threshold expected for sequences of this length is 4
     sequence_length = len(issue_15_sequences[0])
     expected_threshold = 4
     self.assertEqual(
         get_one_ref_like_threshold_distance(sequence_length), expected_threshold
     )

     are_ref_like = sequences_are_one_reference_like(issue_15_sequences)
     self.assertTrue(are_ref_like)
Esempio n. 4
0
    def test_GivenSequencesWithSameKmerCounts_ClusteringInterrupted(self):
        """
        Sequences below are not 'one-ref-like', yet kmer counts are identical.
        This is because the sequences contain repeats and gaps, making them
        not identical from the point of view of edit distance.
        Number of clusters will try to be increased, but kmeans will only find one,
        as there is a single data point in kmer space.
        This test checks the code deals with this by aborting further clustering.
        """
        sequences = [
            "TTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAATTTTTTTAAAAAAA-------",
            "-------TTTTTTTAAAAAAATTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAA",
            "TTTTTTTAAAAAAATTTTTTTAAAAAAATTTTTTT-------GGGGGGG-------AAAAAAA",
        ]
        ungapped_sequences = [ungap(sequence) for sequence in sequences]
        distinct_kmers = count_distinct_kmers(ungapped_sequences, kmer_size=7)
        count_matrix = count_kmer_occurrences(ungapped_sequences, distinct_kmers)
        # Stringify each row of the count matrix so that identical rows collapse in the set
        distinct_count_patterns = {str(counts) for counts in count_matrix}

        # Preconditions: use TestCase assertions rather than bare `assert`, which is
        # stripped under `python -O` and would silently skip these checks.
        self.assertEqual(len(distinct_count_patterns), 1)
        self.assertFalse(sequences_are_one_reference_like(sequences))

        alignment = make_alignment(sequences)
        result = kmeans_cluster_seqs_in_interval([0, len(sequences[0])], alignment, 7)
        self.assertTrue(result.no_clustering)
Esempio n. 5
0
 def test_GivenShortSeqsWithMoreThanOneDiff_AreNotRefLike(self):
     """Check that short sequences differing at every position are not one-ref-like."""
     sequences = ["AA", "TT", "CC"]
     seqlen = len(sequences[0])
     # assertLess reports both operands on failure, unlike assertTrue(a < b)
     self.assertLess(seqlen, LENGTH_THRESHOLD)
     # The distance threshold expected for sequences of this length is 1
     self.assertEqual(get_one_ref_like_threshold_distance(seqlen), 1)
     self.assertFalse(sequences_are_one_reference_like(sequences))
Esempio n. 6
0
 def test_GivenSnpsOnly_AreRefLike(self):
     """Check that single-character sequences are reported as one-ref-like."""
     snp_alleles = ["A", "T", "C"]
     are_ref_like = sequences_are_one_reference_like(snp_alleles)
     self.assertTrue(are_ref_like)
Esempio n. 7
0
 def test_GivenSequencesWithDifferentLengths_Fails(self):
     """Check that sequences of unequal length are rejected with a ValueError."""
     unequal_length_sequences = ["AT", "AA", "CCC"]
     # Callable form of assertRaises: invokes the function with the given argument
     self.assertRaises(
         ValueError, sequences_are_one_reference_like, unequal_length_sequences
     )