Beispiel #1
0
    def test_one_long_one_short_sequence_separate_and_ordered_clusters(self):
        """One fully-sequenced and one mostly-gapped record form two clusters.

        The clusters are expected in the same order as the records appear in
        the alignment, so reversing the alignment reverses the result.
        """
        records = [
            SeqRecord(Seq("AATTAATTATATAATAAC"), id="s1"),
            SeqRecord(Seq("A--------------AAT"), id="s2"),
        ]
        alignment = MSA(records)
        alignment_length = len(alignment[0])

        forward = kmeans_cluster_seqs_in_interval(
            [0, alignment_length], alignment, 5
        )
        self.assertEqual(forward, [["s1"], ["s2"]])

        # Same records, opposite order: the cluster order must follow.
        flipped = alignment[::-1]
        backward = kmeans_cluster_seqs_in_interval(
            [0, alignment_length], flipped, 5
        )
        self.assertEqual(backward, [["s2"], ["s1"]])
Beispiel #2
0
 def test_ambiguous_sequences_in_short_interval_separate_clusters(self):
     """Two records containing ambiguity codes (R, W) end up in separate clusters."""
     named_seqs = (("s1", "ARAT"), ("s2", "WAAT"))
     alignment = MSA(
         [SeqRecord(Seq(bases), id=seq_id) for seq_id, bases in named_seqs]
     )
     clusters = kmeans_cluster_seqs_in_interval([0, 3], alignment, 5)
     self.assertEqual([["s1"], ["s2"]], clusters)
Beispiel #3
0
 def get_variants(self, interval) -> Sequences:
     """Return the variant sequences (or nested PRGs) for a non-match interval.

     If `self.skip_clustering(...)` says so, the sequences of the alignment
     columns `interval.start` to `interval.stop` (inclusive) are expanded and
     returned directly. Otherwise the sequences are clustered; when the
     clustering reports `no_clustering`, its sequences are used as they are,
     else the method recurses through `self.prg_recur` on the clustered ids.

     Asserts that more than one variant was produced and that no variant is
     repeated.
     """
     clustering_skipped = self.skip_clustering(
         interval,
         self.nesting_level,
         self.max_nesting,
         self.min_match_length,
         self.alignment,
     )

     if clustering_skipped:
         # Slice is inclusive of interval.stop, hence the + 1.
         columns = self.alignment[:, interval.start:interval.stop + 1]
         variant_prgs = get_expanded_sequences(columns)
         logging.debug(f"Variant seqs found: {variant_prgs}")
     else:
         clustering = kmeans_cluster_seqs_in_interval(
             [interval.start, interval.stop],
             self.alignment,
             self.min_match_length,
         )
         if not clustering.no_clustering:
             # Sequences were grouped: build a nested PRG per cluster.
             variant_prgs = self.prg_recur(interval, clustering.clustered_ids)
         else:
             logging.debug(
                 "Clustering did not group any sequences together, each seq is a cluster"
             )
             variant_prgs = clustering.sequences
             logging.debug(f"Variant seqs found: {variant_prgs}")

     assert len(variant_prgs) > 1, "Only have one variant seq"
     deduplicated = list(remove_duplicates(variant_prgs))
     assert len(variant_prgs) == len(deduplicated), "have repeat variant seqs"
     return variant_prgs
Beispiel #4
0
 def test_GivenManyVeryDifferentSequences_EachSeqInOwnCluster(self):
     """Clustering every distinct 4-mer stops at MAX_CLUSTERS clusters.

     The input is every length-4 combination of `standard_bases`; the search
     for clusters is expected to keep going until it hits the MAX_CLUSTERS cap.
     """
     four_mers = ["".join(bases) for bases in product(standard_bases, repeat=4)]
     clustering = kmeans_cluster_seqs_in_interval(
         [0, 4], make_alignment(four_mers), 4
     )
     num_clusters = len(clustering.clustered_ids)
     self.assertEqual(num_clusters, MAX_CLUSTERS)
Beispiel #5
0
 def test_GivenRepeatedUngappedSequencesBelowKmerSize_EndUpInSameCluster(
         self):
     """Records that are identical once gaps are removed share a cluster.

     "A-A-T" and "AA--T" both read "AAT" without gaps, so s0 and s2 are
     expected together while "CCCCC" (s1) stands alone.
     """
     alignment = make_alignment(["A-A-T", "CCCCC", "AA--T"])
     clusters = kmeans_cluster_seqs_in_interval([0, 4], alignment, 4)
     self.assertEqual(clusters, [["s0", "s2"], ["s1"]])
Beispiel #6
0
    def test_first_id_in_first_cluster(self):
        """The cluster holding the alignment's first record is listed first.

        Checked on the alignment as given and on its reversal.
        """
        sequences = [
            "AATTAATTATATAATAAC",
            "AATTAAGTATATAATAAC",
            "TTAATTAATTAATTAATT",
        ]
        alignment = make_alignment(sequences, ["s1", "s2", "s3"])
        alignment_length = len(alignment[0])

        forward = kmeans_cluster_seqs_in_interval(
            [0, alignment_length], alignment, 5
        )
        self.assertEqual(forward.clustered_ids, [["s1", "s2"], ["s3"]])

        # With the records reversed, s3 comes first and so must its cluster.
        reversed_alignment = alignment[::-1]
        backward = kmeans_cluster_seqs_in_interval(
            [0, alignment_length], reversed_alignment, 5
        )
        self.assertEqual(backward.clustered_ids, [["s3"], ["s2", "s1"]])
Beispiel #7
0
 def test_GivenThreeSequencesAboveKmerSize_KMeansClusteringCalled(self, mockfit):
     """With three sequences longer than the kmer size, the patched fit is called once.

     `mockfit` is the mock injected by a patch decorator that is applied
     outside this method; only the fact that it was called is checked, so
     the clustering result itself is deliberately discarded.
     """
     alignment = make_alignment(["AAAT", "TTTT", "ATAT"])
     try:
         kmeans_cluster_seqs_in_interval([0, 3], alignment, 2)
     except ValueError:
         # NOTE(review): presumably raised downstream because the mocked fit
         # does not produce a usable clustering - confirm. The error is not
         # what this test is about, so it is ignored on purpose.
         pass
     mockfit.assert_called_once()
Beispiel #8
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """Clustering runs on records whose gap-free lengths differ.

     Without gaps these records have lengths 2, 5 and 4. Per the original
     author's note, the 'one-ref' check on clusters uses a hamming distance,
     which would fail if it were computed on the ungapped sequences.
     """
     alignment = make_alignment(["A---T", "AAAAT", "AAA-T"])
     clusters = kmeans_cluster_seqs_in_interval([0, 4], alignment, 1)
     self.assertEqual(clusters, [["s0"], ["s1", "s2"]])
Beispiel #9
0
 def test_TwoIdenticalSequencesClusteredTogether(self):
     """Two identical records share a cluster; the differing third is alone."""
     named_seqs = (("s1", "AAAT"), ("s2", "AAAT"), ("s3", "C-CC"))
     records = [SeqRecord(Seq(bases), id=seq_id) for seq_id, bases in named_seqs]
     clusters = kmeans_cluster_seqs_in_interval([0, 3], MSA(records), 1)
     self.assertEqual([["s1", "s2"], ["s3"]], clusters)
Beispiel #10
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """Clustering runs on records whose gap-free lengths differ.

     Without gaps these records have lengths 2, 5 and 4. Per the original
     author's note, the 'one-ref' check on clusters uses a hamming distance,
     which requires sequences of equal length; the ungapped forms would
     therefore make that computation fail.
     """
     alignment = make_alignment(["A---T", "AAAAT", "AAA-T"])
     clustering = kmeans_cluster_seqs_in_interval([0, 4], alignment, 1)
     self.assertEqual(clustering.clustered_ids, [["s0"], ["s1", "s2"]])
Beispiel #11
0
 def test_GivenTwoSequenceGroups_ReturnsTwoClusters(self):
     """Two visibly distinct sequence groups give two clusters for kmer sizes 1-6."""
     sequences = ["CATATAAAATA", "CATATAATATA", "GGGGCGGGCCC", "GGGGCGGGCGC"]
     alignment = make_alignment(sequences)
     last_position = len(sequences[0]) - 1
     two_groups = [["s0", "s1"], ["s2", "s3"]]
     for kmer_size in (1, 2, 3, 4, 5, 6):
         clustering = kmeans_cluster_seqs_in_interval(
             [0, last_position], alignment, kmer_size
         )
         self.assertEqual(two_groups, clustering.clustered_ids)
Beispiel #12
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """Sequences that differ at a single position are not clustered.

     For kmer sizes 1-6 the result must flag `no_clustering` and hand back
     the input sequences unchanged.
     """
     sequences = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     alignment = make_alignment(sequences)
     last_position = len(sequences[0]) - 1
     for kmer_size in (1, 2, 3, 4, 5, 6):
         outcome = kmeans_cluster_seqs_in_interval(
             [0, last_position], alignment, kmer_size
         )
         self.assertTrue(outcome.no_clustering)
         self.assertEqual(outcome.sequences, sequences)
Beispiel #13
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """Sequences that differ at a single position each get their own cluster.

     Checked for kmer sizes 1-6: the result must be one singleton cluster per
     record, in alignment order.
     """
     sequences = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     alignment = make_alignment(sequences)
     last_position = len(sequences[0]) - 1
     singletons = [[record.id] for record in alignment]
     for kmer_size in (1, 2, 3, 4, 5, 6):
         clusters = kmeans_cluster_seqs_in_interval(
             [0, last_position], alignment, kmer_size
         )
         self.assertEqual(singletons, clusters)
Beispiel #14
0
 def test_GivenAllSequencesBelowKmerSize_NoKMeansAndIdenticalSequencesClustered(
         self, mockKMeans):
     """When every gap-free sequence is shorter than the kmer size, KMeans is not used.

     `mockKMeans` is the mock injected by a patch decorator applied outside
     this method. s1 and s4 both read "AAAT" without gaps and are expected in
     the same cluster; s2 and s3 stay on their own.
     """
     named_seqs = (
         ("s1", "AA---AT"),
         ("s2", "AA---TT"),
         ("s3", "CA--CAT"),
         ("s4", "A-A--AT"),
     )
     alignment = MSA(
         [SeqRecord(Seq(bases), id=seq_id) for seq_id, bases in named_seqs]
     )
     alignment_length = len(alignment[0])
     clusters = kmeans_cluster_seqs_in_interval(
         [0, alignment_length], alignment, 6
     )
     mockKMeans.assert_not_called()
     self.assertEqual([["s1", "s4"], ["s2"], ["s3"]], clusters)
Beispiel #15
0
    def test_GivenAllSequencesBelowKmerSize_NoClustering(self, mockKMeans):
        """When every gap-free sequence is shorter than the kmer size, nothing is clustered.

        `mockKMeans` is the mock injected by a patch decorator applied outside
        this method and must stay uncalled. The result is expected to list the
        distinct gap-free sequences ("AAAT" occurs twice in the input but once
        in the output).
        """
        gapped_sequences = ["AA---AT", "AA---TT", "CA--CAT", "A-A--AT"]
        alignment = make_alignment(gapped_sequences)
        alignment_length = len(alignment[0])

        outcome = kmeans_cluster_seqs_in_interval(
            [0, alignment_length], alignment, 6
        )

        mockKMeans.assert_not_called()
        self.assertTrue(outcome.no_clustering)
        self.assertEqual(outcome.sequences, ["AAAT", "AATT", "CACAT"])
Beispiel #16
0
 def test_GivenThreeSequenceGroups_ReturnsThreeClusters(self):
     """Three distinct sequence groups are each recovered as a cluster.

     For kmer sizes 1-6, every expected pair of ids must appear as one of the
     returned clusters. The order of the clusters is not checked.
     """
     sequences = [
         "CCCCCCAACCT",
         "CCCCCCAATCT",
         "GGGGCGGGCCC",
         "GGGGCGGGCGC",
         "TTTAATTTTAA",
         "TTTAAGTTTAA",
     ]
     expected_clustering = [["s0", "s1"], ["s2", "s3"], ["s4", "s5"]]
     seq_size = len(sequences[0])
     alignment = make_alignment(sequences)
     for kmer_size in range(1, 7):
         result = kmeans_cluster_seqs_in_interval([0, seq_size - 1],
                                                  alignment, kmer_size)
         for cluster in expected_clustering:
             # assertIn reports both the missing cluster and the actual
             # result on failure, which assertTrue(x in y) does not.
             self.assertIn(cluster, result)
Beispiel #17
0
 def test_first_sequence_placed_in_first_cluster(self):
     """
     Check on random alignments that the first record's id heads the first cluster.

     Runs kmeans clustering on randomly generated multiple sequence alignments,
     once with a kmer size below the sequence length and once above it.
     """
     seq_len = 20
     num_seqs = 5
     bases = list(standard_bases)
     # Function has different behaviour at below and above seq_len
     for used_len in [seq_len - 5, seq_len + 5]:
         # Label the subtest with the kmer size actually in use, so that a
         # failure report says which of the two settings broke.
         with self.subTest(kmer_size=used_len):
             for _ in range(5):  # Run on a number of random alignments
                 sequences = [
                     "".join(random.choices(bases, k=seq_len))
                     for _ in range(num_seqs)
                 ]
                 alignment = make_alignment(sequences)
                 result = kmeans_cluster_seqs_in_interval([0, seq_len - 1],
                                                          alignment,
                                                          used_len)
                 # assertEqual shows the offending id when the check fails.
                 self.assertEqual(result[0][0], "s0")
Beispiel #18
0
 def test_GivenAllSequencesSmallEditDist_ReturnsNoClustering(self):
     """Closely related sequences each get their own cluster (kmer sizes 4-7).

     Cf graph 157.pdf in issue #15.
     """
     sequences = [
         "gctccgccggtcccgccggtcc",
         "gctccgccgggcccgccggtcc",
         "tctccgccggtcccgccggtcc",
         "gctcagccggtcccgccggtcc",
         "gctccgccggtcccaccggtcc",
         "gctccgccggtaccgccggtcc",
         "gctccgctggtcccgccggtcc",
         "gctccgccggtcccgctggtcc",
         "gctccgccggtcccgccggtct",
         "gctccgccggtcccgcctgtcc",
         "gctccgccggtcctgccggtcc",
     ]
     alignment = make_alignment(sequences)
     last_position = len(sequences[0]) - 1
     singletons = [[record.id] for record in alignment]
     for kmer_size in (4, 5, 6, 7):
         clusters = kmeans_cluster_seqs_in_interval(
             [0, last_position], alignment, kmer_size
         )
         self.assertEqual(singletons, clusters)
Beispiel #19
0
    def test_GivenSequencesWithSameKmerCounts_ClusteringInterrupted(self):
        """
        Clustering gives up when all sequences share one point in kmer space.

        The sequences below are not 'one-ref-like', yet their kmer counts are
        identical: they are built from repeats and gaps, so they differ in
        terms of edit distance but not in kmer content.
        The number of clusters will try to be increased, but kmeans can only
        ever find one, since there is a single data point in kmer space.
        This test checks the code handles that by aborting further clustering.
        """
        sequences = [
            "TTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAATTTTTTTAAAAAAA-------",
            "-------TTTTTTTAAAAAAATTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAA",
            "TTTTTTTAAAAAAATTTTTTTAAAAAAATTTTTTT-------GGGGGGG-------AAAAAAA",
        ]

        # Preconditions: one count pattern only, and not one-reference-like.
        gap_free = [ungap(sequence) for sequence in sequences]
        kmers = count_distinct_kmers(gap_free, kmer_size=7)
        counts = count_kmer_occurrences(gap_free, kmers)
        count_patterns = {str(row) for row in counts}
        assert len(count_patterns) == 1
        assert not sequences_are_one_reference_like(sequences)

        outcome = kmeans_cluster_seqs_in_interval(
            [0, len(sequences[0])], make_alignment(sequences), 7
        )
        self.assertTrue(outcome.no_clustering)
Beispiel #20
0
 def test_one_seq_returns_single_id(self):
     """An alignment with a single record is reported as not clustered."""
     lone_record = SeqRecord(Seq("AAAT"), id="s1")
     outcome = kmeans_cluster_seqs_in_interval([0, 3], MSA([lone_record]), 1)
     self.assertTrue(outcome.no_clustering)
Beispiel #21
0
 def test_GivenTwoDifferentSeqs_NoKmeansAndTwoClusters(self, mockKMeans):
     """Two differing sequences are not clustered, and KMeans is never invoked.

     `mockKMeans` is the mock injected by a patch decorator applied outside
     this method.
     """
     two_seqs = make_alignment(["AAAT", "ATAT"])
     outcome = kmeans_cluster_seqs_in_interval([0, 3], two_seqs, 1)
     mockKMeans.assert_not_called()
     self.assertTrue(outcome.no_clustering)
Beispiel #22
0
 def test_two_seqs_one_below_kmer_size_separate_clusters(self):
     """A full-length record and one shortened by gaps land in separate clusters.

     Without gaps "AA---AT" has 4 characters, fewer than the kmer size of 5.
     """
     long_and_short = make_alignment(["AATTTAT", "AA---AT"])
     clusters = kmeans_cluster_seqs_in_interval([0, 5], long_and_short, 5)
     self.assertEqual(clusters, [["s0"], ["s1"]])
Beispiel #23
0
 def test_GivenLessThanTwoLongSeqs_NoClustering(self):
     """With only one sequence at least as long as the kmer size, nothing is clustered.

     Only "CCCCC" reaches the kmer size of 4 once gaps are dropped. The result
     is expected to list the distinct gap-free sequences.
     """
     alignment = make_alignment(["A-A-T", "CCCCC", "AA--T"])
     outcome = kmeans_cluster_seqs_in_interval([0, 4], alignment, 4)
     self.assertTrue(outcome.no_clustering)
     self.assertEqual(outcome.sequences, ["AAT", "CCCCC"])
Beispiel #24
0
    def _get_prg(self) -> str:
        """Build and return the PRG string for this builder's alignment.

        Walks `self.all_intervals` in order. A match interval contributes its
        single filtered sequence. Any other interval becomes a variant site:
        its alternatives are either the interval's sequences taken directly,
        or the PRGs of nested `PrgBuilder`s, one per cluster of sequences.

        Side effects: advances `self.site` (by 2 per variant site, plus
        whatever the nested builders consume) and records nested builders in
        `self.subAlignedSeqs`, keyed by `interval.start`.

        Raises:
            AssertionError: if a match interval yields more than one sequence,
                if a variant site has fewer than two alternatives or repeated
                alternatives, or if the number of nested PRGs differs from the
                number of clusters.
        """
        prg = ""

        for interval in self.all_intervals:
            if interval in self.match_intervals:
                # all seqs are not necessarily exactly the same: some can have 'N'
                # thus still process all of them, to get the one with no 'N'.
                sub_alignment = self.alignment[:, interval.start : interval.stop + 1]
                seqs = get_interval_seqs(sub_alignment)
                assert len(seqs) == 1, "Got >1 filtered sequences in match interval"
                seq = seqs[0]
                prg += seq

            else:
                # Define variant site number and increment for next available
                # (site_num delimits the site, site_num + 1 separates its alternatives)
                site_num = self.site
                self.site += 2
                variant_prgs = []

                # Define the variant seqs to add
                if (self.nesting_level == self.max_nesting) or (
                    interval.stop - interval.start <= self.min_match_length
                ):
                    logging.debug(
                        "Have reached max nesting level or have a small variant site, so add all variant "
                        "sequences in interval."
                    )
                    # Column slice is inclusive of interval.stop, hence the + 1.
                    sub_alignment = self.alignment[
                        :, interval.start : interval.stop + 1
                    ]
                    variant_prgs = get_interval_seqs(sub_alignment)
                    logging.debug(f"Variant seqs found: {variant_prgs}")
                else:
                    # Cluster the sequences and build one nested PRG per cluster.
                    recur = True
                    id_lists = kmeans_cluster_seqs_in_interval(
                        [interval.start, interval.stop],
                        self.alignment,
                        self.min_match_length,
                    )
                    list_sub_alignments = [
                        self.get_sub_alignment_by_list_id(
                            id_list, self.alignment, [interval.start, interval.stop]
                        )
                        for id_list in id_lists
                    ]
                    num_clusters = len(id_lists)

                    # As many clusters as sequences means nothing was grouped;
                    # the nested builders are still created below, but they are
                    # not recorded in subAlignedSeqs.
                    if len(list_sub_alignments) == self.num_seqs:
                        logging.debug(
                            "Clustering did not group any sequences together, each seq is a cluster"
                        )
                        recur = False
                    elif interval.start not in self.subAlignedSeqs:
                        self.subAlignedSeqs[interval.start] = []
                        logging.debug(
                            "subAlignedSeqs now has keys: %s",
                            list(self.subAlignedSeqs.keys()),
                        )
                    else:
                        logging.debug(
                            "subAlignedSeqs already had key %d in keys: %s. This shouldn't happen.",
                            interval.start,
                            list(self.subAlignedSeqs.keys()),
                        )

                    # Consume the sub-alignments in cluster order. Each nested
                    # builder starts from the current site number and hands
                    # back the next free one, so the order matters.
                    while len(list_sub_alignments) > 0:
                        sub_alignment = list_sub_alignments.pop(0)
                        sub_aligned_seq = PrgBuilder(
                            msa_file=self.msa_file,
                            alignment_format=self.alignment_format,
                            max_nesting=self.max_nesting,
                            nesting_level=self.nesting_level + 1,
                            min_match_length=self.min_match_length,
                            site=self.site,
                            alignment=sub_alignment,
                            interval=interval,
                        )
                        variant_prgs.append(sub_aligned_seq.prg)
                        self.site = sub_aligned_seq.site

                        if recur:
                            self.subAlignedSeqs[interval.start].append(sub_aligned_seq)
                    assert num_clusters == len(variant_prgs), (
                        "I don't seem to have a sub-prg sequence for all parts of the partition - there are %d "
                        "classes in partition, and %d variant seqs"
                        % (num_clusters, len(variant_prgs))
                    )
                assert len(variant_prgs) > 1, "Only have one variant seq"

                assert len(variant_prgs) == len(
                    list(remove_duplicates(variant_prgs))
                ), "have repeat variant seqs"

                # Add the variant seqs to the prg.
                # Layout: <d>site<d> alt1 <d>site+1<d> alt2 ... altN <d>site<d>,
                # where <d> is self.delim_char.
                prg += f"{self.delim_char}{site_num}{self.delim_char}"
                while len(variant_prgs) > 1:
                    prg += variant_prgs.pop(0)
                    prg += f"{self.delim_char}{site_num + 1}{self.delim_char}"
                prg += variant_prgs.pop()
                prg += f"{self.delim_char}{site_num}{self.delim_char}"

        return prg
Beispiel #25
0
 def test_GivenTwoIdenticalSeqs_NoKmeansAndOneCluster(self, mockKMeans):
     """Two identical sequences form a single cluster without invoking KMeans.

     `mockKMeans` is the mock injected by a patch decorator applied outside
     this method.
     """
     identical_pair = make_alignment(["AAAT", "AAAT"])
     clusters = kmeans_cluster_seqs_in_interval([0, 3], identical_pair, 1)
     mockKMeans.assert_not_called()
     self.assertEqual(clusters, [["s0", "s1"]])