コード例 #1
0
 def test_avoid_empty_alleles_previous_non_match_merged(self):
     """
     Edge case of collapsed match interval, part 2:
     only the trailing 'TTT' is expected to remain a match interval, with
     everything before it reported as a single non-match interval.
     """
     consensus = "**TTA**TTT"
     alignment = make_alignment(["CCTTAGGTTT", "AATTA--TTT"])
     partitioner = IntervalPartitioner(
         consensus, min_match_length=3, alignment=alignment
     )
     match_intervals, non_match_intervals, _ = partitioner.get_intervals()

     expected_match = make_typed_intervals([[7, 9]], Match)
     self.assertEqual(match_intervals, expected_match)
     expected_non_match = make_typed_intervals([[0, 6]], NonMatch)
     self.assertEqual(non_match_intervals, expected_non_match)
コード例 #2
0
 def test_GivenThreeSequencesAboveKmerSize_KMeansClusteringCalled(self, mockfit):
     """
     Clustering three sequences of length 4 with kmer size 2 must call the
     mocked fit exactly once.

     `mockfit` is presumably injected by a patch decorator outside this view.
     """
     alignment = make_alignment(["AAAT", "TTTT", "ATAT"])
     try:
         # The return value is irrelevant here: only the call to the mock
         # is under test.
         kmeans_cluster_seqs_in_interval([0, 3], alignment, 2)
     except ValueError:
         # Deliberately tolerated.
         # NOTE(review): presumably raised downstream because fit is mocked
         # and leaves the model unfitted — confirm.
         pass
     mockfit.assert_called_once()
コード例 #3
0
 def test_GivenRepeatedUngappedSequencesBelowKmerSize_EndUpInSameCluster(self):
     """
     "A-A-T" and "AA--T" are the same sequence once gaps are removed,
     so s0 and s2 are expected to share a cluster.
     """
     alignment = make_alignment(["A-A-T", "CCCCC", "AA--T"])
     expected_clusters = [["s0", "s2"], ["s1"]]
     clusters = kmeans_cluster_seqs_in_interval([0, 4], alignment, 4)
     self.assertEqual(clusters, expected_clusters)
コード例 #4
0
 def test_N_special_treatment(self):
     """
     'N' is treated specially when computing the consensus:
     i) A and N at position 2 differ, yet the consensus there is 'A'
     ii) N and N at position 0 are the same, yet the position is
         reported as non-consensus ('*')
     """
     msa = make_alignment(["NTN", "NTA"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "*TA")
コード例 #5
0
 def test_GivenManyVeryDifferentSequences_EachSeqInOwnCluster(self):
     """
     Feed every 4-mer over `standard_bases` to the clustering: it should keep
     looking for clusters and stop once MAX_CLUSTERS is reached.
     """
     every_4mer = ["".join(kmer) for kmer in product(standard_bases, repeat=4)]
     msa = make_alignment(every_4mer)
     clustering = kmeans_cluster_seqs_in_interval([0, 4], msa, 4)
     num_clusters = len(clustering.clustered_ids)
     self.assertEqual(num_clusters, MAX_CLUSTERS)
コード例 #6
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """
     The sequences below have different lengths once gaps are removed.
     If the 'one-ref' property of clusters were checked on ungapped
     sequences, the hamming distance computation would fail on them.
     """
     msa = make_alignment(["A---T", "AAAAT", "AAA-T"])
     expected_clusters = [["s0"], ["s1", "s2"]]
     clusters = kmeans_cluster_seqs_in_interval([0, 4], msa, 1)
     self.assertEqual(clusters, expected_clusters)
コード例 #7
0
 def test_SequencesUnevenLengthIfGapsRemoved_ClusteringRuns(self):
     """
     Clustering must run on sequences whose lengths differ once gaps are
     removed (a hamming distance computation on such ungapped sequences
     would fail due to the different sequence lengths).
     """
     msa = make_alignment(["A---T", "AAAAT", "AAA-T"])
     expected_ids = [["s0"], ["s1", "s2"]]
     clustering = kmeans_cluster_seqs_in_interval([0, 4], msa, 1)
     self.assertEqual(clustering.clustered_ids, expected_ids)
コード例 #8
0
 def test_nonmatch_interval_switching_indels(self):
     """
     The non-match interval is expected to switch to a match interval,
     because the sequences are the same despite their different alignment.
     """
     msa = make_alignment(["A---A", "A-A--"])
     matches, non_matches = make_intervals([], [[0, 5]])

     IntervalPartitioner.enforce_multisequence_nonmatch_intervals(
         matches, non_matches, msa
     )

     expected_matches = make_typed_intervals([[0, 5]], Match)
     self.assertEqual(matches, expected_matches)
     self.assertEqual(non_matches, [])
コード例 #9
0
 def test_nonmatch_interval_switching_Ns(self):
     """
     Sequences containing 'N' get removed, so the non-match interval is
     expected to switch to a match interval.
     """
     msa = make_alignment(["ANAAA", "ATAAT"])
     matches, non_matches = make_intervals([], [[0, 5]])

     IntervalPartitioner.enforce_multisequence_nonmatch_intervals(
         matches, non_matches, msa
     )

     expected_matches = make_typed_intervals([[0, 5]], Match)
     self.assertEqual(matches, expected_matches)
     self.assertEqual(non_matches, [])
コード例 #10
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """
     The four sequences differ only at position 7; for kmer sizes 1 to 6
     the result must flag no clustering and return the input sequences.
     """
     sequences = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     last_pos = len(sequences[0]) - 1
     msa = make_alignment(sequences)
     for k in range(1, 7):
         clustering = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertTrue(clustering.no_clustering)
         self.assertEqual(clustering.sequences, sequences)
コード例 #11
0
 def test_GivenTwoSequenceGroups_ReturnsTwoClusters(self):
     """
     Two pairs of near-identical sequences must be clustered pairwise,
     for every kmer size from 1 to 6.
     """
     sequences = ["CATATAAAATA", "CATATAATATA", "GGGGCGGGCCC", "GGGGCGGGCGC"]
     last_pos = len(sequences[0]) - 1
     msa = make_alignment(sequences)
     expected_ids = [["s0", "s1"], ["s2", "s3"]]
     for k in range(1, 7):
         clustering = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertEqual(expected_ids, clustering.clustered_ids)
コード例 #12
0
 def test_avoid_empty_alleles_short_match(self):
     """
     Padding of the non-match interval is expected here too, but it leaves
     the leading match interval too short, so that interval collapses into
     the non_match interval.
     """
     consensus = "TTA**TTT"
     alignment = make_alignment(["TTAGGTTT", "TTA--TTT"])
     partitioner = IntervalPartitioner(
         consensus, min_match_length=3, alignment=alignment
     )
     match_intervals, non_match_intervals, _ = partitioner.get_intervals()

     expected_match = make_typed_intervals([[5, 7]], Match)
     self.assertEqual(match_intervals, expected_match)
     expected_non_match = make_typed_intervals([[0, 4]], NonMatch)
     self.assertEqual(non_match_intervals, expected_non_match)
コード例 #13
0
 def test_avoid_empty_alleles_long_match(self):
     """
     A non-match interval of only [4,5] would produce an empty allele in
     the prg, so it must be padded with the preceding match sequence,
     giving [3,5].
     """
     consensus = "TTAA**TTT"
     alignment = make_alignment(["TTAAGGTTT", "TTAA--TTT"])
     partitioner = IntervalPartitioner(
         consensus, min_match_length=3, alignment=alignment
     )
     match_intervals, non_match_intervals, _ = partitioner.get_intervals()

     expected_match = make_typed_intervals([[0, 2], [6, 8]], Match)
     self.assertEqual(match_intervals, expected_match)
     expected_non_match = make_typed_intervals([[3, 5]], NonMatch)
     self.assertEqual(non_match_intervals, expected_non_match)
コード例 #14
0
 def test_GivenAllSequencesOneSnpApart_ReturnsNoClustering(self):
     """
     The four sequences differ only at position 7; for kmer sizes 1 to 6
     every record is expected to end up alone in its own cluster.
     """
     sequences = ["CATATAAAATA", "CATATAACATA", "CATATAAGATA", "CATATAATATA"]
     last_pos = len(sequences[0]) - 1
     msa = make_alignment(sequences)
     singleton_clusters = [[rec.id] for rec in msa]
     for k in range(1, 7):
         clusters = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertEqual(singleton_clusters, clusters)
コード例 #15
0
 def test_ambiguous_alignment_skip_clustering(self):
     """
     `extra_seq` below aligns the sequence "ATTTTA" just as validly as
     "A--TTTTA" does.
     An alignment is ambiguous when more than one gapped alignment
     corresponds to the same ungapped sequence. Such alignments are not
     clustered, as that can create ambiguous graphs (whereby different
     paths spell the same sequence).
     """
     extra_seq = "ATTTT--A"
     ambiguous_seqs = self.aligned_seqs + [extra_seq]
     self.tested_params["alignment"] = make_alignment(ambiguous_seqs)
     should_skip = PrgBuilder.skip_clustering(**self.tested_params)
     self.assertTrue(should_skip)
コード例 #16
0
 def setUp(self):
     """
     Build a baseline set of parameters under which clustering is to be
     performed; individual tests modify one of them at a time.
     """
     self.aligned_seqs = ["ATTTTTTA", "A--TTTTA", "ATTTCTTA"]
     whole_interval = Interval(IntervalType.Match, 0, 7)
     baseline_alignment = make_alignment(self.aligned_seqs)
     self.tested_params = {
         "interval": whole_interval,
         "max_nesting": 2,
         "nesting_level": 1,
         "min_match_length": 2,
         "alignment": baseline_alignment,
     }
コード例 #17
0
    def test_GivenAllSequencesBelowKmerSize_NoClustering(self, mockKMeans):
        """
        Every sequence is shorter than the kmer size (6) once gaps are
        removed: KMeans must not be called, and the result must flag no
        clustering and hold the distinct ungapped sequences.
        """
        gapped_seqs = ["AA---AT", "AA---TT", "CA--CAT", "A-A--AT"]
        msa = make_alignment(gapped_seqs)
        whole_interval = [0, len(msa[0])]

        clustering = kmeans_cluster_seqs_in_interval(whole_interval, msa, 6)

        mockKMeans.assert_not_called()
        self.assertTrue(clustering.no_clustering)
        # "A-A--AT" ungaps to "AAAT", same as the first sequence
        self.assertEqual(clustering.sequences, ["AAAT", "AATT", "CACAT"])
コード例 #18
0
    def test_first_id_in_first_cluster(self):
        """
        The id of the alignment's first record must lead the first cluster,
        both for the original record order and for the reversed one.
        """
        seqs = [
            "AATTAATTATATAATAAC",
            "AATTAAGTATATAATAAC",
            "TTAATTAATTAATTAATT",
        ]
        msa = make_alignment(seqs, ["s1", "s2", "s3"])

        forward = kmeans_cluster_seqs_in_interval([0, len(msa[0])], msa, 5)
        self.assertEqual(forward.clustered_ids, [["s1", "s2"], ["s3"]])

        # Same records, reversed order: "s3" now comes first
        backward = kmeans_cluster_seqs_in_interval([0, len(msa[0])], msa[::-1], 5)
        self.assertEqual(backward.clustered_ids, [["s3"], ["s2", "s1"]])
コード例 #19
0
 def test_GivenThreeSequenceGroups_ReturnsThreeClusters(self):
     """
     Three pairs of near-identical sequences: for kmer sizes 1 to 6, each
     expected pair must appear as a cluster in the result (the order of the
     clusters is not checked).
     """
     sequences = [
         "CCCCCCAACCT",
         "CCCCCCAATCT",
         "GGGGCGGGCCC",
         "GGGGCGGGCGC",
         "TTTAATTTTAA",
         "TTTAAGTTTAA",
     ]
     expected_clustering = [["s0", "s1"], ["s2", "s3"], ["s4", "s5"]]
     seq_size = len(sequences[0])
     alignment = make_alignment(sequences)
     for kmer_size in range(1, 7):
         result = kmeans_cluster_seqs_in_interval(
             [0, seq_size - 1], alignment, kmer_size
         )
         for cluster in expected_clustering:
             # assertIn reports both the cluster and the result on failure,
             # where assertTrue would only say "False is not true"
             self.assertIn(cluster, result)
コード例 #20
0
 def test_first_sequence_placed_in_first_cluster(self):
     """
     Run kmeans clustering on randomly generated multiple sequence
     alignments and check that the first id of the first cluster is "s0"
     (presumably the id given to the first input sequence).
     """
     seq_len = 20
     num_seqs = 5
     bases = list(standard_bases)
     # Function has different behaviour at kmer sizes below and above seq_len
     for used_len in [seq_len - 5, seq_len + 5]:
         # Label the subtest with the kmer size actually in use, so a
         # failure report shows which of the two regimes broke
         with self.subTest(kmer_size=used_len):
             for _ in range(5):  # Run on a number of random alignments
                 sequences = [
                     "".join(random.choices(bases, k=seq_len))
                     for _ in range(num_seqs)
                 ]
                 alignment = make_alignment(sequences)
                 result = kmeans_cluster_seqs_in_interval(
                     [0, seq_len - 1], alignment, used_len
                 )
                 # assertEqual shows the offending id on failure
                 self.assertEqual(result[0][0], "s0")
コード例 #21
0
 def test_GivenAllSequencesSmallEditDist_ReturnsNoClustering(self):
     """
     Cf graph 157.pdf in issue #15.
     For kmer sizes 4 to 7, every record is expected to end up alone in
     its own cluster.
     """
     sequences = [
         "gctccgccggtcccgccggtcc",
         "gctccgccgggcccgccggtcc",
         "tctccgccggtcccgccggtcc",
         "gctcagccggtcccgccggtcc",
         "gctccgccggtcccaccggtcc",
         "gctccgccggtaccgccggtcc",
         "gctccgctggtcccgccggtcc",
         "gctccgccggtcccgctggtcc",
         "gctccgccggtcccgccggtct",
         "gctccgccggtcccgcctgtcc",
         "gctccgccggtcctgccggtcc",
     ]
     last_pos = len(sequences[0]) - 1
     msa = make_alignment(sequences)
     singleton_clusters = [[rec.id] for rec in msa]
     for k in range(4, 8):
         clusters = kmeans_cluster_seqs_in_interval([0, last_pos], msa, k)
         self.assertEqual(singleton_clusters, clusters)
コード例 #22
0
    def test_GivenSequencesWithSameKmerCounts_ClusteringInterrupted(self):
        """
        Sequences below are not 'one-ref-like', yet kmer counts are identical.
        This is because the sequences contain repeats and gaps, making them
        not identical from the point of view of edit distance.
        Number of clusters will try to be increased, but kmeans will only find one,
        as there is a single data point in kmer space.
        This test checks the code deals with this by aborting further clustering.
        """
        sequences = [
            "TTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAATTTTTTTAAAAAAA-------",
            "-------TTTTTTTAAAAAAATTTTTTTGGGGGGGAAAAAAATTTTTTT-------AAAAAAA",
            "TTTTTTTAAAAAAATTTTTTTAAAAAAATTTTTTT-------GGGGGGG-------AAAAAAA",
        ]

        # Check the premises of the test first. unittest assertions are used
        # rather than bare `assert`, which is stripped when running with -O.
        ungapped_sequences = [ungap(sequence) for sequence in sequences]
        distinct_kmers = count_distinct_kmers(ungapped_sequences, kmer_size=7)
        count_matrix = count_kmer_occurrences(ungapped_sequences, distinct_kmers)
        # Rows are stringified so that they can be put in a set
        distinct_count_patterns = set(map(str, count_matrix))
        self.assertEqual(len(distinct_count_patterns), 1)
        self.assertFalse(sequences_are_one_reference_like(sequences))

        alignment = make_alignment(sequences)
        result = kmeans_cluster_seqs_in_interval([0, len(sequences[0])], alignment, 7)
        self.assertTrue(result.no_clustering)
コード例 #23
0
 def test_sub_alignment_with_empty_sequence(self):
     """Within [3, 4] the second sequence consists of gaps only."""
     alignment = make_alignment(["TTAGGTTT", "TTA--TTT", "GGA-TTTT"])
     gap_only_interval = [3, 4]
     found_empty = has_empty_sequence(alignment, gap_only_interval)
     self.assertTrue(found_empty)
コード例 #24
0
 def setUpClass(cls):
     """Store a four-sequence alignment, with ids s1 to s4, on the class."""
     seqs = ["AAAT", "C--C", "AATT", "GNGG"]
     seq_ids = ["s1", "s2", "s3", "s4"]
     cls.alignment = make_alignment(seqs, seq_ids)
コード例 #25
0
 def test_get_subalignment_with_interval(self):
     """Selecting ids s2 and s3 over interval [0, 2] keeps their first three columns."""
     wanted_ids = ["s2", "s3"]
     sub_alignment = PrgBuilder.get_sub_alignment_by_list_id(
         wanted_ids, self.alignment, [0, 2]
     )
     expected_alignment = make_alignment(["C--", "AAT"], ["s2", "s3"])
     self.assertTrue(msas_equal(expected_alignment, sub_alignment))
コード例 #26
0
 def test_all_gap_nonmatch(self):
     """Columns made of gaps only are reported as non-match ('*')."""
     msa = make_alignment(["A--A", "A--A"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "A**A")
コード例 #27
0
 def test_count_alignment_seqs(self):
     """Counting the sequences obtained from the alignment of `self.input_seqs` gives 3."""
     alignment = make_alignment(self.input_seqs)
     alignment_seqs = get_alignment_seqs(alignment)
     num_seqs = count(alignment_seqs)
     self.assertEqual(num_seqs, 3)
コード例 #28
0
 def test_IUPACAmbiguous_nonmatch(self):
     """
     Ambiguous IUPAC codes give a non-match ('*'), even where both
     sequences carry the same code ('R' at position 0).
     """
     msa = make_alignment(["RYA", "RTA"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "**A")
コード例 #29
0
 def test_mixed_match_nonmatch(self):
     """Positions where the two sequences differ (0 and 2) become '*'; the others keep their base."""
     msa = make_alignment(["AAGTA", "CATTA"])
     consensus = PrgBuilder.get_consensus(msa)
     self.assertEqual(consensus, "*A*TA")
コード例 #30
0
 def test_first_sequence_in_is_first_sequence_out(self):
     """Expanded sequences come out in input order, with the gap dropped from the last one."""
     msa = make_alignment(["TTTT", "AAAA", "CC-C"])
     expected_seqs = ["TTTT", "AAAA", "CCC"]
     expanded_seqs = get_expanded_sequences(msa)
     self.assertEqual(expected_seqs, expanded_seqs)