コード例 #1
0
    def _get_prg(self):
        """Assemble the PRG string by visiting every interval in order.

        Match intervals contribute their single expanded sequence unchanged.
        Every other interval becomes a variant site that consumes two site
        numbers from ``self.site``: ``n`` brackets the site and ``n + 1``
        separates its alleles, each number wrapped in ``self.delim_char``::

            <d>n<d>allele_1<d>n+1<d>allele_2 ... <d>n+1<d>allele_k<d>n<d>

        The list returned by ``get_variants`` is only read, never modified.

        Returns:
            The concatenated PRG string.

        Raises:
            IndexError: if ``get_variants`` returns no sequences for a
                variant site.
        """
        # Collect the pieces and join once instead of growing a string.
        pieces = []
        for interval in self.all_intervals:
            if interval in self.match_intervals:
                # all seqs are not necessarily exactly the same: some can have 'N'
                # thus still process all of them, to get the one with no 'N'.
                sub_alignment = self.alignment[:, interval.start:interval.stop + 1]
                seqs = get_expanded_sequences(sub_alignment)
                assert len(
                    seqs) == 1, "Got >1 filtered sequences in match interval"
                pieces.append(seqs[0])
                continue

            # Define variant site number and increment for next available.
            # The number is claimed before get_variants is called, which keeps
            # the original order of operations.
            site_num = self.site
            self.site += 2
            variant_prgs = self.get_variants(interval)
            if not variant_prgs:
                # Same exception type the former pop()-based code produced.
                raise IndexError(
                    "No variant sequences returned for non-match interval")

            site_marker = f"{self.delim_char}{site_num}{self.delim_char}"
            allele_marker = f"{self.delim_char}{site_num + 1}{self.delim_char}"

            # Add the variant seqs to the prg without consuming the list: it
            # may be owned by whoever produced it.
            pieces.append(site_marker)
            pieces.append(allele_marker.join(variant_prgs))
            pieces.append(site_marker)
        return "".join(pieces)
コード例 #2
0
 def enforce_multisequence_nonmatch_intervals(
         cls, match_intervals: Intervals, non_match_intervals: Intervals,
         alignment: MSA) -> None:
     """
     Demote non-match intervals that do not actually contain variation.

     For every non-match interval, the alignment columns it spans are expanded into sequences; when fewer than
     two sequences come back, the interval is removed from `non_match_intervals` and an equivalent Match
     interval (same start and stop) is appended to `match_intervals`.
     Both interval collections are modified in-place; nothing is returned.
     Typical causes of such a demotion:
         - a sequence containing 'N' gets filtered out, leaving a single useable sequence
         - sequences only differ by '-' and are therefore the same once expanded
     """
     if len(alignment) == 0:  # For testing convenience
         return
     # Walk from the last index down so removing an entry never shifts the
     # positions that are still to be visited.
     for position in range(len(non_match_intervals) - 1, -1, -1):
         candidate = non_match_intervals[position]
         columns = alignment[:, candidate.start:candidate.stop + 1]
         if len(get_expanded_sequences(columns)) >= 2:
             continue
         replacement = Interval(IntervalType.Match, candidate.start,
                                candidate.stop)
         match_intervals.append(replacement)
         non_match_intervals.pop(position)
コード例 #3
0
 def get_variants(self, interval) -> Sequences:
     """
     Return the variant sequences (or nested PRGs) for one non-match interval.

     If `skip_clustering` says so, the interval's alignment columns are expanded directly. Otherwise the
     sequences are k-means clustered: when clustering groups nothing, the clustering result's sequences
     are used as they are; when it does, `prg_recur` is called with the clustered ids.
     The result is asserted to hold more than one entry and no duplicates.
     """
     clustering_skipped = self.skip_clustering(
         interval,
         self.nesting_level,
         self.max_nesting,
         self.min_match_length,
         self.alignment,
     )
     if clustering_skipped:
         columns = self.alignment[:, interval.start:interval.stop + 1]
         variant_prgs = get_expanded_sequences(columns)
         logging.debug(f"Variant seqs found: {variant_prgs}")
     else:
         outcome = kmeans_cluster_seqs_in_interval(
             [interval.start, interval.stop],
             self.alignment,
             self.min_match_length,
         )
         if not outcome.no_clustering:
             # Real clusters exist: build the nested PRGs from them.
             variant_prgs = self.prg_recur(interval, outcome.clustered_ids)
         else:
             logging.debug(
                 "Clustering did not group any sequences together, each seq is a cluster"
             )
             variant_prgs = outcome.sequences
             logging.debug(f"Variant seqs found: {variant_prgs}")

     # Sanity checks on whichever path produced the variants.
     assert len(variant_prgs) > 1, "Only have one variant seq"
     assert len(variant_prgs) == len(list(
         remove_duplicates(variant_prgs))), "have repeat variant seqs"
     return variant_prgs
コード例 #4
0
 def test_first_sequence_in_is_first_sequence_out(self):
     # The expanded sequences are expected in the same order as the
     # alignment rows, with the gap character of the last row removed.
     rows = ["TTTT", "AAAA", "CC-C"]
     expanded = get_expanded_sequences(make_alignment(rows))
     self.assertEqual(["TTTT", "AAAA", "CCC"], expanded)
コード例 #5
0
 def test_ambiguous_bases_one_seq_with_repeated_base(self):
     # A single record with 'R' at two positions: every combination of
     # 'G' and 'A' at those positions is expected, in any order.
     record = SeqRecord(Seq("RRAAT"))
     alignment = AlignIO.MultipleSeqAlignment([record])
     expanded = set(get_expanded_sequences(alignment))
     self.assertEqual(expanded, {"AAAAT", "AGAAT", "GAAAT", "GGAAT"})