Beispiel #1
0
    def test_KeepChars(self):
        """KeepChars filters its input down to the characters it was given"""
        keeper = KeepChars("ab c3*[")
        # empty input passes straight through
        self.assertEqual(keeper(""), "")
        # None is rejected rather than treated as empty
        self.assertRaises(TypeError, keeper, None)

        # (input, expected) pairs for the case sensitive filter, checked in
        # order: single characters first, then longer mixed strings
        sensitive_cases = (
            ("b", "b"),
            ("g", ""),
            ("xyz123", "3"),
            ("xyz  123", "  3"),
            ("kjbwherzcagebcujrkcs", "bcabcc"),
            ("f[ffff*ff*fff3fff", "[**3"),
        )
        for text, expected in sensitive_cases:
            self.assertEqual(keeper(text), expected)

        # second positional argument False switches off case sensitivity;
        # the case of the input characters is retained in the output
        keeper = KeepChars("AbC", False)
        insensitive_cases = (
            ("abcdef", "abc"),
            ("ABCDEF", "ABC"),
            ("aBcDeF", "aBc"),
        )
        for text, expected in insensitive_cases:
            self.assertEqual(keeper(text), expected)
Beispiel #2
0
    def __init__(
        self,
        motifset,
        gap=IUPAC_gap,
        missing=IUPAC_missing,
        gaps=None,
        seq_constructor=None,
        ambiguities=None,
        label=None,
        complements=None,
        pairs=None,
        mw_calculator=None,
        add_lower=False,
        preserve_existing_moltypes=False,
        make_alphabet_group=False,
        array_seq_constructor=None,
        colors=None,
    ):
        """Returns a new MolType object. Note that the parameters are in flux.

        Parameters
        ----------
        motifset
            Alphabet or sequence of items in the default
            alphabet. Does not include degenerates.
        gap
            default gap symbol
        missing
            symbol for missing data
        gaps
            any other symbols that should be treated as gaps (doesn't have
            to include gap or missing; they will be silently added)
        seq_constructor
            Class for constructing sequences. Defaults to ``"".join`` when
            not provided.
        ambiguities
            dict of char:tuple, doesn't include gaps (entries for the gap
            and missing symbols are added automatically).
        label
            text label, stored unchanged as ``self.label``.
        complements
            dict of symbol:symbol showing how the non-degenerate
            single characters complement each other. Used for constructing
            on the fly the complement table, incl. support for must_pair and
            can_pair.
        pairs
            dict in which keys are pairs of symbols that can pair
            with each other, values are True (must pair) or False (might
            pair). Currently, the meaning of GU pairs as 'weak' is conflated
            with the meaning of degenerate symbol pairs (which might pair
            with each other but don't necessarily, depending on how the
            symbol is resolved). This should be refactored.
        mw_calculator
            f(seq) -> molecular weight.
        add_lower
            if True (default: False) adds the lowercase versions of
            everything into the alphabet. Slated for deletion.
        preserve_existing_moltypes
            if True (default: False), the ``moltype`` attribute of
            seq_constructor is left untouched; otherwise it is set to
            this instance.
        make_alphabet_group
            if True, makes an AlphabetGroup relating
            the various alphabets to one another.
        array_seq_constructor
            sequence type for array sequence
        colors
            dict mapping moltype characters to colors for display. When
            not provided, every character defaults to "black".

        Notes
        -----
        "degenerates" versus "ambiguities": self.degenerates contains
        _only_ mappings for degenerate symbols, whereas self.ambiguities
        contains mappings for both degenerate and non-degenerate symbols.
        Sometimes you want one, sometimes the other, so both are provided.
        """
        # must remain the first statement: at this point locals() holds only
        # the constructor arguments, which is what gets recorded
        self._serialisable = {k: v for k, v in locals().items() if k != "self"}
        self.gap = gap
        self.missing = missing
        self.gaps = frozenset([gap, missing])
        if gaps:
            self.gaps = self.gaps.union(frozenset(gaps))
        self.label = label
        # set the sequence constructor
        if seq_constructor is None:
            seq_constructor = "".join  # safe default string constructor
        elif not preserve_existing_moltypes:
            # note: this mutates the object supplied by the caller
            seq_constructor.moltype = self
        self._make_seq = seq_constructor

        # set the ambiguities
        # update order matters: user supplied entries may override the
        # gap/missing defaults, and motif -> (motif,) entries override both
        ambigs = {
            self.missing: tuple(motifset) + (self.gap,),
            self.gap: (self.gap,),
        }
        if ambiguities:
            ambigs.update(ambiguities)
        for c in motifset:
            ambigs[c] = (c,)
        self.ambiguities = ambigs

        # set complements -- must set before we make the alphabet group
        self.complements = complements or {}

        if make_alphabet_group:  # note: must use _original_ ambiguities here
            self.alphabets = AlphabetGroup(motifset, ambiguities, moltype=self)
            self.alphabet = self.alphabets.base
        else:
            if isinstance(motifset, Enumeration):
                self.alphabet = motifset
            # max() raises ValueError if motifset is empty
            elif max(len(motif) for motif in motifset) == 1:
                self.alphabet = CharAlphabet(motifset, moltype=self)
            else:
                self.alphabet = Alphabet(motifset, moltype=self)
        # set the other properties
        # copies are taken so the caller's dicts are not modified below
        self.degenerates = ambiguities.copy() if ambiguities else {}
        self.degenerates[self.missing] = "".join(motifset) + self.gap
        self.matches = make_matches(motifset, self.gaps, self.degenerates)
        self.pairs = pairs.copy() if pairs else {}
        self.pairs.update(
            make_pairs(pairs, motifset, self.gaps, self.degenerates))
        self.mw_calculator = mw_calculator

        # add lowercase characters, if we're doing that
        if add_lower:
            self._add_lowercase()
        # cache various other data that make the calculations faster
        self._make_all()
        self._make_comp_table()
        # a gap can be a true gap char or a degenerate character, typically '?'
        # we therefore want to ensure consistent treatment across the definition
        # of characters as either gap or degenerate
        self.gap_string = "".join(self.gaps)
        strict_gap = "".join(set(self.gap_string) - set(self.degenerates))
        self.strip_degenerate = FunctionWrapper(
            KeepChars(strict_gap + "".join(self.alphabet)))
        self.strip_bad = FunctionWrapper(KeepChars("".join(self.All)))
        # parentheses spell out the precedence Python already applies here
        # ("-" binds tighter than "^"), so behaviour is unchanged
        # NOTE(review): as written, a gap symbol that is also in the alphabet
        # is kept, and a symbol in both alphabet and degenerates is dropped
        # -- confirm this is intended
        to_keep = set(self.alphabet) ^ (
            set(self.degenerates) - set(self.gaps))
        self.strip_bad_and_gaps = FunctionWrapper(KeepChars("".join(to_keep)))

        # make inverse degenerates from degenerates
        # ensure that lowercase versions also exist if appropriate
        inv_degens = {}
        for key, val in self.degenerates.items():
            inv_degens[frozenset(val)] = key.upper()
            if add_lower:
                inv_degens[frozenset("".join(val).lower())] = key.lower()
        for m in self.alphabet:
            inv_degens[frozenset(m)] = m
            if add_lower:
                inv_degens[frozenset("".join(m).lower())] = m.lower()
        for m in self.gaps:
            inv_degens[frozenset(m)] = m
        self.inverse_degenerates = inv_degens

        # set array type for modeling alphabets
        try:
            self.array_type = self.alphabet.array_type
        except AttributeError:
            # alphabet does not define an array type
            self.array_type = None

        # set modeling sequence
        self._make_array_seq = array_seq_constructor

        self._colors = colors or defaultdict(_DefaultValue("black"))