コード例 #1
0
    def test_one_long_one_short_sequence_separate_and_ordered_clusters(self):
        """A fully-spelled sequence and a mostly-gapped one are clustered apart.

        The cluster order is expected to follow the record order of the
        alignment, so reversing the alignment reverses the clusters.
        """
        records = [
            SeqRecord(Seq("AATTAATTATATAATAAC"), id="s1"),
            SeqRecord(Seq("A--------------AAT"), id="s2"),
        ]
        alignment = MSA(records)
        num_columns = len(alignment[0])

        forward_clusters = kmeans_cluster_seqs_in_interval(
            [0, num_columns], alignment, 5
        )
        self.assertEqual(forward_clusters, [["s1"], ["s2"]])

        # Same records, opposite order.
        reversed_alignment = alignment[::-1]
        reversed_clusters = kmeans_cluster_seqs_in_interval(
            [0, num_columns], reversed_alignment, 5
        )
        self.assertEqual(reversed_clusters, [["s2"], ["s1"]])
コード例 #2
0
 def test_ambiguous_sequences_in_short_interval_separate_clusters(self):
     """Two short sequences holding non-ACGT letters (R, W) end up in separate clusters."""
     named_seqs = (("s1", "ARAT"), ("s2", "WAAT"))
     records = [SeqRecord(Seq(bases), id=name) for name, bases in named_seqs]
     alignment = MSA(records)

     clusters = kmeans_cluster_seqs_in_interval([0, 3], alignment, 5)

     self.assertEqual([["s1"], ["s2"]], clusters)
コード例 #3
0
 def get_variants(self, interval) -> Sequences:
     """Return the variant sequences (or sub-PRGs) for ``interval``.

     When ``self.skip_clustering(...)`` is truthy the sequences are taken
     directly from the alignment columns of the interval. Otherwise the
     sequences are clustered; if clustering reports ``no_clustering`` its
     ``sequences`` are used as-is, else ``self.prg_recur`` is called with
     the clustered ids.

     Raises:
         AssertionError: if fewer than two variants were produced, or if
             the produced variants contain duplicates.
     """
     should_skip = self.skip_clustering(
         interval,
         self.nesting_level,
         self.max_nesting,
         self.min_match_length,
         self.alignment,
     )
     if should_skip:
         # interval.stop is treated as inclusive, hence the + 1 on the slice.
         interval_columns = self.alignment[:, interval.start:interval.stop + 1]
         variant_prgs = get_expanded_sequences(interval_columns)
         logging.debug(f"Variant seqs found: {variant_prgs}")
     else:
         clustering = kmeans_cluster_seqs_in_interval(
             [interval.start, interval.stop],
             self.alignment,
             self.min_match_length,
         )
         if not clustering.no_clustering:
             variant_prgs = self.prg_recur(interval, clustering.clustered_ids)
         else:
             logging.debug(
                 "Clustering did not group any sequences together, each seq is a cluster"
             )
             variant_prgs = clustering.sequences
             logging.debug(f"Variant seqs found: {variant_prgs}")

     num_variants = len(variant_prgs)
     assert num_variants > 1, "Only have one variant seq"
     num_distinct = len(list(remove_duplicates(variant_prgs)))
     assert num_variants == num_distinct, "have repeat variant seqs"
     return variant_prgs
コード例 #4
0
 def test_GivenManyVeryDifferentSequences_EachSeqInOwnCluster(self):
     """Clustering every 4-mer over ``standard_bases`` yields MAX_CLUSTERS clusters."""
     # All distinct DNA 4-mers: clustering should keep looking for more
     # clusters and stop once MAX_CLUSTERS is reached.
     all_4mers = ["".join(kmer) for kmer in product(standard_bases, repeat=4)]
     alignment = make_alignment(all_4mers)

     clustering = kmeans_cluster_seqs_in_interval([0, 4], alignment, 4)

     num_clusters = len(clustering.clustered_ids)
     self.assertEqual(num_clusters, MAX_CLUSTERS)
コード例 #5
0
 def test_GivenRepeatedUngappedSequencesBelowKmerSize_EndUpInSameCluster(
         self):
     """Sequences that are identical once gaps are dropped share a cluster.

     "A-A-T" and "AA--T" both read "AAT" without gaps, which is shorter
     than the kmer size of 4 used here.
     """
     alignment = make_alignment(["A-A-T", "CCCCC", "AA--T"])

     actual = kmeans_cluster_seqs_in_interval([0, 4], alignment, 4)

     expected = [["s0", "s2"], ["s1"]]
     self.assertEqual(actual, expected)
コード例 #6
0
    def test_first_id_in_first_cluster(self):
        """The first record of the alignment must appear in the first cluster.

        Checked on the alignment as given and on its reversal.
        """
        seqs = [
            "AATTAATTATATAATAAC",
            "AATTAAGTATATAATAAC",
            "TTAATTAATTAATTAATT",
        ]
        ids = ["s1", "s2", "s3"]
        alignment = make_alignment(seqs, ids)
        num_columns = len(alignment[0])

        forward = kmeans_cluster_seqs_in_interval([0, num_columns], alignment, 5)
        self.assertEqual(forward.clustered_ids, [["s1", "s2"], ["s3"]])

        flipped_alignment = alignment[::-1]
        backward = kmeans_cluster_seqs_in_interval(
            [0, num_columns], flipped_alignment, 5
        )
        self.assertEqual(backward.clustered_ids, [["s3"], ["s2", "s1"]])
コード例 #7
0
 def test_GivenThreeSequencesAboveKmerSize_KMeansClusteringCalled(self, mockfit):
     """Three length-4 sequences with kmer size 2 must trigger exactly one ``fit`` call.

     Args:
         mockfit: mock injected by a patch decorator that is not visible in
             this snippet; only its call count is inspected.
     """
     alignment = make_alignment(["AAAT", "TTTT", "ATAT"])
     try:
         # Only the call to the mocked fit matters; the return value is unused.
         kmeans_cluster_seqs_in_interval([0, 3], alignment, 2)
     except ValueError:
         # NOTE(review): presumably raised downstream because the mocked fit
         # leaves the estimator without real results - confirm. It is
         # deliberately ignored so the call-count check below still runs.
         pass
     mockfit.assert_called_once()
コード例 #8
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """Clustering must run on sequences whose ungapped lengths differ.

     If the 'one-ref' property check on clusters used ungapped sequences,
     the hamming distance computation would fail because the sequences
     would have different lengths.
     """
     gapped_seqs = ["A---T", "AAAAT", "AAA-T"]
     alignment = make_alignment(gapped_seqs)

     actual = kmeans_cluster_seqs_in_interval([0, 4], alignment, 1)

     expected = [["s0"], ["s1", "s2"]]
     self.assertEqual(actual, expected)
コード例 #9
0
 def test_TwoIdenticalSequencesClusteredTogether(self):
     """Two identical records share a cluster; the differing third record is alone."""
     named_seqs = (("s1", "AAAT"), ("s2", "AAAT"), ("s3", "C-CC"))
     alignment = MSA(
         [SeqRecord(Seq(bases), id=name) for name, bases in named_seqs]
     )

     clusters = kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)

     self.assertEqual([["s1", "s2"], ["s3"]], clusters)
コード例 #10
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """Clustering must run on sequences whose ungapped lengths differ.

     With gaps removed the three sequences have lengths 2, 5 and 4.
     NOTE(review): presumably guards the 'one-ref' property check against
     computing hamming distances on sequences of unequal length - confirm
     against the clustering implementation.
     """
     gapped_seqs = ["A---T", "AAAAT", "AAA-T"]
     alignment = make_alignment(gapped_seqs)

     clustering = kmeans_cluster_seqs_in_interval([0, 4], alignment, 1)

     expected_ids = [["s0"], ["s1", "s2"]]
     self.assertEqual(clustering.clustered_ids, expected_ids)
コード例 #11
0
 def test_GivenTwoSequenceGroups_ReturnsTwoClusters(self):
     """Two visibly distinct groups of sequences give two clusters for kmer sizes 1-6."""
     at_rich = ["CATATAAAATA", "CATATAATATA"]
     gc_rich = ["GGGGCGGGCCC", "GGGGCGGGCGC"]
     sequences = at_rich + gc_rich
     last_column = len(sequences[0]) - 1
     alignment = make_alignment(sequences)
     expected_clustering = [["s0", "s1"], ["s2", "s3"]]

     for kmer_size in range(1, 7):
         clustering = kmeans_cluster_seqs_in_interval(
             [0, last_column], alignment, kmer_size
         )
         self.assertEqual(expected_clustering, clustering.clustered_ids)
コード例 #12
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """Sequences differing at a single position are reported as not clustered.

     Checked for kmer sizes 1 through 6; the result must also carry the
     input sequences unchanged.
     """
     sequences = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     alignment = make_alignment(sequences)
     last_column = len(sequences[0]) - 1

     for kmer_size in range(1, 7):
         clustering = kmeans_cluster_seqs_in_interval(
             [0, last_column], alignment, kmer_size
         )
         self.assertTrue(clustering.no_clustering)
         self.assertEqual(clustering.sequences, sequences)
コード例 #13
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """Sequences differing at a single position each form their own cluster.

     Checked for kmer sizes 1 through 6.
     """
     sequences = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     last_column = len(sequences[0]) - 1
     alignment = make_alignment(sequences)
     # One singleton cluster per record, in alignment order.
     singleton_clusters = [[rec.id] for rec in alignment]

     for kmer_size in range(1, 7):
         clusters = kmeans_cluster_seqs_in_interval(
             [0, last_column], alignment, kmer_size
         )
         self.assertEqual(singleton_clusters, clusters)
コード例 #14
0
 def test_GivenAllSequencesBelowKmerSize_NoKMeansAndIdenticalSequencesClustered(
         self, mockKMeans):
     """No KMeans call when every ungapped sequence is shorter than the kmer size.

     "AA---AT" (s1) and "A-A--AT" (s4) both read "AAAT" without gaps and
     are expected to be grouped together.
     """
     named_seqs = (
         ("s1", "AA---AT"),
         ("s2", "AA---TT"),
         ("s3", "CA--CAT"),
         ("s4", "A-A--AT"),
     )
     alignment = MSA(
         [SeqRecord(Seq(bases), id=name) for name, bases in named_seqs]
     )
     num_columns = len(alignment[0])

     clusters = kmeans_cluster_seqs_in_interval([0, num_columns], alignment, 6)

     mockKMeans.assert_not_called()
     self.assertEqual([["s1", "s4"], ["s2"], ["s3"]], clusters)
コード例 #15
0
    def test_GivenAllSequencesBelowKmerSize_NoClustering(self, mockKMeans):
        """No KMeans call and no clustering when all sequences are shorter than the kmer size.

        The result is expected to hold the distinct ungapped sequences.
        """
        gapped_seqs = ["AA---AT", "AA---TT", "CA--CAT", "A-A--AT"]
        alignment = make_alignment(gapped_seqs)
        num_columns = len(alignment[0])

        clustering = kmeans_cluster_seqs_in_interval([0, num_columns], alignment, 6)

        mockKMeans.assert_not_called()
        self.assertTrue(clustering.no_clustering)
        self.assertEqual(clustering.sequences, ["AAAT", "AATT", "CACAT"])
コード例 #16
0
 def test_GivenThreeSequenceGroups_ReturnsThreeClusters(self):
     """Three visibly distinct sequence groups are each returned as one cluster.

     Checked for kmer sizes 1 through 6. Only membership of each expected
     cluster is verified, not the order of the clusters.
     """
     sequences = [
         "CCCCCCAACCT",
         "CCCCCCAATCT",
         "GGGGCGGGCCC",
         "GGGGCGGGCGC",
         "TTTAATTTTAA",
         "TTTAAGTTTAA",
     ]
     expected_clustering = [["s0", "s1"], ["s2", "s3"], ["s4", "s5"]]
     seq_size = len(sequences[0])
     alignment = make_alignment(sequences)
     for kmer_size in range(1, 7):
         result = kmeans_cluster_seqs_in_interval([0, seq_size - 1],
                                                  alignment, kmer_size)
         for cluster in expected_clustering:
             # assertIn reports both the cluster and the result on failure,
             # unlike assertTrue(cluster in result).
             self.assertIn(cluster, result)
コード例 #17
0
 def test_first_sequence_placed_in_first_cluster(self):
     """
     Runs kmeans clustering on randomly generated multiple sequence alignments
     and checks that the first sequence ("s0") leads the first cluster.
     """
     seq_len = 20
     num_seqs = 5
     bases = list(standard_bases)
     # Exercise kmer sizes both below and above the sequence length.
     for used_len in [seq_len - 5, seq_len + 5]:
         # Label the sub-test with the kmer size actually in use, so a
         # failure report identifies which of the two cases broke.
         with self.subTest(kmer_size=used_len):
             for _ in range(5):  # Run on a number of random alignments
                 sequences = [
                     "".join(random.choices(bases, k=seq_len))
                     for _ in range(num_seqs)
                 ]
                 alignment = make_alignment(sequences)
                 result = kmeans_cluster_seqs_in_interval([0, seq_len - 1],
                                                          alignment,
                                                          used_len)
                 # assertEqual shows the offending id on failure.
                 self.assertEqual(result[0][0], "s0")
コード例 #18
0
 def test_GivenAllSequencesSmallEditDist_ReturnsNoClustering(self):
     """Near-identical sequences each stay in their own cluster (kmer sizes 4-7).

     Cf graph 157.pdf in issue #15.
     """
     sequences = [
         "gctccgccggtcccgccggtcc",
         "gctccgccgggcccgccggtcc",
         "tctccgccggtcccgccggtcc",
         "gctcagccggtcccgccggtcc",
         "gctccgccggtcccaccggtcc",
         "gctccgccggtaccgccggtcc",
         "gctccgctggtcccgccggtcc",
         "gctccgccggtcccgctggtcc",
         "gctccgccggtcccgccggtct",
         "gctccgccggtcccgcctgtcc",
         "gctccgccggtcctgccggtcc",
     ]
     last_column = len(sequences[0]) - 1
     alignment = make_alignment(sequences)
     # One singleton cluster per record, in alignment order.
     singleton_clusters = [[rec.id] for rec in alignment]

     for kmer_size in range(4, 8):
         clusters = kmeans_cluster_seqs_in_interval(
             [0, last_column], alignment, kmer_size
         )
         self.assertEqual(singleton_clusters, clusters)
コード例 #19
0
    def test_GivenSequencesWithSameKmerCounts_ClusteringInterrupted(self):
        """
        The sequences below are not 'one-ref-like', yet their kmer counts are
        identical: they contain repeats and gaps, so they differ in terms of
        edit distance while mapping to a single data point in kmer space.
        Clustering will try to increase the number of clusters, but kmeans can
        only ever find one. This test checks that the code copes by aborting
        further clustering.
        """
        sequences = [
            "TTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAATTTTTTTAAAAAAA-------",
            "-------TTTTTTTAAAAAAATTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAA",
            "TTTTTTTAAAAAAATTTTTTTAAAAAAATTTTTTT-------GGGGGGG-------AAAAAAA",
        ]

        # Preconditions: one distinct kmer-count pattern, not one-ref-like.
        without_gaps = [ungap(seq) for seq in sequences]
        kmers = count_distinct_kmers(without_gaps, kmer_size=7)
        kmer_counts = count_kmer_occurrences(without_gaps, kmers)
        count_patterns = {str(row) for row in kmer_counts}
        assert len(count_patterns) == 1
        assert not sequences_are_one_reference_like(sequences)

        alignment = make_alignment(sequences)
        num_columns = len(sequences[0])
        clustering = kmeans_cluster_seqs_in_interval([0, num_columns], alignment, 7)
        self.assertTrue(clustering.no_clustering)
コード例 #20
0
 def test_one_seq_returns_single_id(self):
     """An alignment holding a single record is reported as not clustered."""
     only_record = SeqRecord(Seq("AAAT"), id="s1")
     alignment = MSA([only_record])

     clustering = kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)

     self.assertTrue(clustering.no_clustering)
コード例 #21
0
 def test_GivenTwoDifferentSeqs_NoKmeansAndTwoClusters(self, mockKMeans):
     """Two differing sequences: KMeans is never called and no clustering is reported."""
     two_seqs = ["AAAT", "ATAT"]
     alignment = make_alignment(two_seqs)

     clustering = kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)

     mockKMeans.assert_not_called()
     self.assertTrue(clustering.no_clustering)
コード例 #22
0
 def test_two_seqs_one_below_kmer_size_separate_clusters(self):
     """A full-length sequence and a gapped, shorter one are put in separate clusters."""
     full_length, gapped = "AATTTAT", "AA---AT"
     alignment = make_alignment([full_length, gapped])

     clusters = kmeans_cluster_seqs_in_interval([0, 5], alignment, 5)

     self.assertEqual(clusters, [["s0"], ["s1"]])
コード例 #23
0
 def test_GivenLessThanTwoLongSeqs_NoClustering(self):
     """With only one sequence reaching the kmer size, no clustering is reported.

     The result is expected to hold the distinct ungapped sequences:
     "A-A-T" and "AA--T" both read "AAT".
     """
     gapped_seqs = ["A-A-T", "CCCCC", "AA--T"]
     alignment = make_alignment(gapped_seqs)

     clustering = kmeans_cluster_seqs_in_interval([0, 4], alignment, 4)

     self.assertTrue(clustering.no_clustering)
     self.assertEqual(clustering.sequences, ["AAT", "CCCCC"])
コード例 #24
0
ファイル: prg_builder.py プロジェクト: mbhall88/make_prg
    def _get_prg(self):
        """Build and return the PRG string for ``self.alignment``.

        Walks ``self.all_intervals`` in order:

        * For an interval in ``self.match_intervals`` the single filtered
          sequence of the interval is appended as-is.
        * For any other interval a variant site is emitted. Its alleles are
          either the interval's sequences (at maximum nesting level, or when
          the interval is small), or the PRGs of nested ``PrgBuilder``
          objects built from the clusters of the interval's sequences.

        A site is written as the site marker, then the alleles joined by the
        allele marker (site number + 1), then the site marker again; each
        marker is wrapped in ``self.delim_char``.

        Side effects: advances ``self.site`` and may add entries to
        ``self.subAlignedSeqs``.

        Raises:
            AssertionError: if a match interval yields more than one
                sequence, if a variant site has fewer than two alleles, or
                if its alleles contain duplicates.
        """
        prg = ""

        for interval in self.all_intervals:
            if interval in self.match_intervals:
                # all seqs are not necessarily exactly the same: some can have 'N'
                # thus still process all of them, to get the one with no 'N'.
                sub_alignment = self.alignment[:, interval.start : interval.stop + 1]
                seqs = get_interval_seqs(sub_alignment)
                assert len(seqs) == 1, "Got >1 filtered sequences in match interval"
                prg += seqs[0]
                continue

            # Define variant site number and increment for next available
            site_num = self.site
            self.site += 2
            variant_prgs = []

            # Define the variant seqs to add
            if (self.nesting_level == self.max_nesting) or (
                interval.stop - interval.start <= self.min_match_length
            ):
                logging.debug(
                    "Have reached max nesting level or have a small variant site, so add all variant "
                    "sequences in interval."
                )
                sub_alignment = self.alignment[:, interval.start : interval.stop + 1]
                variant_prgs = get_interval_seqs(sub_alignment)
                logging.debug(f"Variant seqs found: {variant_prgs}")
            else:
                recur = True
                id_lists = kmeans_cluster_seqs_in_interval(
                    [interval.start, interval.stop],
                    self.alignment,
                    self.min_match_length,
                )
                list_sub_alignments = [
                    self.get_sub_alignment_by_list_id(
                        id_list, self.alignment, [interval.start, interval.stop]
                    )
                    for id_list in id_lists
                ]
                num_clusters = len(id_lists)

                if len(list_sub_alignments) == self.num_seqs:
                    logging.debug(
                        "Clustering did not group any sequences together, each seq is a cluster"
                    )
                    recur = False
                elif interval.start not in self.subAlignedSeqs:
                    self.subAlignedSeqs[interval.start] = []
                    logging.debug(
                        "subAlignedSeqs now has keys: %s",
                        list(self.subAlignedSeqs.keys()),
                    )
                else:
                    logging.debug(
                        "subAlignedSeqs already had key %d in keys: %s. This shouldn't happen.",
                        interval.start,
                        list(self.subAlignedSeqs.keys()),
                    )

                # Plain iteration instead of repeatedly popping the list head
                # (list.pop(0) shifts every remaining element each time).
                for sub_alignment in list_sub_alignments:
                    sub_aligned_seq = PrgBuilder(
                        msa_file=self.msa_file,
                        alignment_format=self.alignment_format,
                        max_nesting=self.max_nesting,
                        nesting_level=self.nesting_level + 1,
                        min_match_length=self.min_match_length,
                        site=self.site,
                        alignment=sub_alignment,
                        interval=interval,
                    )
                    variant_prgs.append(sub_aligned_seq.prg)
                    # The nested builder consumed site numbers; carry on
                    # from where it stopped.
                    self.site = sub_aligned_seq.site

                    if recur:
                        self.subAlignedSeqs[interval.start].append(sub_aligned_seq)
                assert num_clusters == len(variant_prgs), (
                    "I don't seem to have a sub-prg sequence for all parts of the partition - there are %d "
                    "classes in partition, and %d variant seqs"
                    % (num_clusters, len(variant_prgs))
                )
            assert len(variant_prgs) > 1, "Only have one variant seq"

            assert len(variant_prgs) == len(
                list(remove_duplicates(variant_prgs))
            ), "have repeat variant seqs"

            # Add the variant seqs to the prg.
            site_marker = f"{self.delim_char}{site_num}{self.delim_char}"
            allele_marker = f"{self.delim_char}{site_num + 1}{self.delim_char}"
            prg += site_marker
            for variant in variant_prgs[:-1]:
                prg += variant
                prg += allele_marker
            prg += variant_prgs[-1]
            prg += site_marker

        return prg
コード例 #25
0
 def test_GivenTwoIdenticalSeqs_NoKmeansAndOneCluster(self, mockKMeans):
     """Two identical sequences form one cluster and KMeans is never called."""
     duplicated = ["AAAT", "AAAT"]
     alignment = make_alignment(duplicated)

     clusters = kmeans_cluster_seqs_in_interval([0, 3], alignment, 1)

     mockKMeans.assert_not_called()
     self.assertEqual(clusters, [["s0", "s1"]])