Beispiel #1
0
 def test_pickling(self):
     """CharAlphabet should survive a pickle round trip."""
     alphabet = CharAlphabet("UCAG")
     # build the reference word alphabet before serialising
     expected_words = alphabet.get_word_alphabet(2)
     restored = pickle.loads(pickle.dumps(alphabet))
     self.assertIsInstance(restored, type(alphabet))
     # the restored object must produce an equal 2-word alphabet
     self.assertEqual(restored.get_word_alphabet(2), expected_words)
Beispiel #2
0
 def test_is_valid(self):
     """CharAlphabet is_valid should return True for valid sequence"""
     alphabet = CharAlphabet("bca")
     # (candidate sequence, expected result); strings and lists are both
     # exercised, including members that are not in the alphabet
     cases = [
         ("", True),
         ("bbb", True),
         ("bbbaac", True),
         ("bbd", False),
         ("d", False),
         (["a", "b"], True),
         (["a", None], False),
     ]
     for candidate, expected in cases:
         self.assertEqual(alphabet.is_valid(candidate), expected)
Beispiel #3
0
 def test_triples(self):
     """triples should cache the same object."""
     alphabet = CharAlphabet("UCAG")
     first_access = alphabet.Triples
     # 4 characters taken 3 at a time
     self.assertEqual(len(first_access), 64)
     # a second access must hand back the very same object
     self.assertIs(first_access, alphabet.Triples)
Beispiel #4
0
 def test_pairs(self):
     """pairs should cache the same object."""
     alphabet = CharAlphabet("UCAG")
     first_access = alphabet.pairs
     # 4 characters taken 2 at a time
     self.assertEqual(len(first_access), 16)
     # a second access must hand back the very same object
     self.assertIs(first_access, alphabet.pairs)
Beispiel #5
0
 def test_init(self):
     """CharAlphabet init should make correct translation tables"""
     r = CharAlphabet("UCAG")
     i2c, c2i = r._indices_nums_to_chars, r._chars_to_indices
     # tobytes() replaces the deprecated ndarray.tostring() alias; both
     # return the raw bytes of the array
     s = array([0, 0, 1, 0, 3, 2], "b").tobytes()
     # index bytes -> characters
     self.assertEqual(s.translate(i2c), b"UUCUGA")
     # characters -> index code points
     self.assertEqual("UUCUGA".translate(c2i), "\000\000\001\000\003\002")
Beispiel #6
0
 def test_to_string(self):
     """CharAlphabet to_string should convert an input array to string"""
     alphabet = CharAlphabet("UCAG")
     cases = [
         # two rows come back separated by a newline
         ([[0, 0, 1], [0, 3, 2]], "UUC\nUGA"),
         # should work with a single seq held in a 2D array
         ([[0, 0, 1, 0, 3, 2]], "UUCUGA"),
         # should work with a single seq given as a flat array
         ([0, 0, 1, 0, 3, 2], "UUCUGA"),
         # should work with empty seq
         ([], ""),
     ]
     for indices, expected in cases:
         self.assertEqual(alphabet.to_string(array(indices, "B")), expected)
Beispiel #7
0
    def __init__(
        self,
        motifset,
        gap=IUPAC_gap,
        missing=IUPAC_missing,
        gaps=None,
        seq_constructor=None,
        ambiguities=None,
        label=None,
        complements=None,
        pairs=None,
        mw_calculator=None,
        add_lower=False,
        preserve_existing_moltypes=False,
        make_alphabet_group=False,
        array_seq_constructor=None,
        colors=None,
    ):
        """Returns a new MolType object. Note that the parameters are in flux.

        Parameters
        ----------
        motifset
            Alphabet or sequence of items in the default
            alphabet. Does not include degenerates.
        gap
            default gap symbol
        missing
            symbol for missing data
        gaps
            any other symbols that should be treated as gaps (doesn't have
            to include gap or missing; they will be silently added)
        seq_constructor
            Class for constructing sequences. If None, "".join is used.
        ambiguities
            dict of char:tuple, doesn't include gaps (the gap and missing
            symbols are added by this constructor).
        label
            text label, stored as self.label
        complements
            dict of symbol:symbol showing how the non-degenerate
            single characters complement each other. Used for constructing
            on the fly the complement table, incl. support for must_pair and
            can_pair.
        pairs
            dict in which keys are pairs of symbols that can pair
            with each other, values are True (must pair) or False (might
            pair). Currently, the meaning of GU pairs as 'weak' is conflated
            with the meaning of degenerate symbol pairs (which might pair
            with each other but don't necessarily, depending on how the
            symbol is resolved). This should be refactored.
        mw_calculator
            f(seq) -> molecular weight.
        add_lower
            if True (default: False) adds the lowercase versions of
            everything into the alphabet. Slated for deletion.
        preserve_existing_moltypes
            if True (default: False), does not set the moltype attribute
            of seq_constructor to self.
        make_alphabet_group
            if True, makes an AlphabetGroup relating
            the various alphabets to one another.
        array_seq_constructor
            sequence type for array sequence
        colors
            dict mapping moltype characters to colors for display

        Note on "degenerates" versus "ambiguities": self.degenerates contains
        _only_ mappings for degenerate symbols, whereas self.ambiguities
        contains mappings for both degenerate and non-degenerate symbols.
        Sometimes you want one, sometimes the other, so both are provided.
        """
        # must remain the first statement: at this point locals() holds only
        # the call arguments, which is exactly what gets recorded
        self._serialisable = {k: v for k, v in locals().items() if k != "self"}
        self.gap = gap
        self.missing = missing
        self.gaps = frozenset([gap, missing])
        if gaps:
            self.gaps = self.gaps.union(frozenset(gaps))
        self.label = label
        # set the sequence constructor
        if seq_constructor is None:
            seq_constructor = "".join  # safe default string constructor
        elif not preserve_existing_moltypes:
            seq_constructor.moltype = self
        self._make_seq = seq_constructor

        # set the ambiguities
        ambigs = {
            self.missing: tuple(motifset) + (self.gap, ),
            self.gap: (self.gap, )
        }
        if ambiguities:
            ambigs.update(ambiguities)
        # every plain motif maps to itself; done last so it overrides any
        # entry with the same key supplied via ambiguities
        for c in motifset:
            ambigs[c] = (c, )
        self.ambiguities = ambigs

        # set complements -- must set before we make the alphabet group
        self.complements = complements or {}

        if make_alphabet_group:  # note: must use _original_ ambiguities here
            self.alphabets = AlphabetGroup(motifset, ambiguities, moltype=self)
            self.alphabet = self.alphabets.base
        else:
            if isinstance(motifset, Enumeration):
                self.alphabet = motifset
            elif max(len(motif) for motif in motifset) == 1:
                # all motifs are single characters
                self.alphabet = CharAlphabet(motifset, moltype=self)
            else:
                self.alphabet = Alphabet(motifset, moltype=self)
        # set the other properties
        # work on copies so the caller's dicts are never modified
        self.degenerates = ambiguities.copy() if ambiguities else {}
        self.degenerates[self.missing] = "".join(motifset) + self.gap
        self.matches = make_matches(motifset, self.gaps, self.degenerates)
        self.pairs = pairs.copy() if pairs else {}
        self.pairs.update(
            make_pairs(pairs, motifset, self.gaps, self.degenerates))
        self.mw_calculator = mw_calculator

        # add lowercase characters, if we're doing that
        if add_lower:
            self._add_lowercase()
        # cache various other data that make the calculations faster
        self._make_all()
        self._make_comp_table()
        # a gap can be a true gap char or a degenerate character, typically '?'
        # we therefore want to ensure consistent treatment across the definition
        # of characters as either gap or degenerate
        self.gap_string = "".join(self.gaps)
        strict_gap = "".join(set(self.gap_string) - set(self.degenerates))
        self.strip_degenerate = FunctionWrapper(
            KeepChars(strict_gap + "".join(self.alphabet)))
        self.strip_bad = FunctionWrapper(KeepChars("".join(self.All)))
        # "-" binds tighter than "^": gaps are removed from the degenerates
        # only, then the symmetric difference with the alphabet is taken.
        # The parentheses just make that existing evaluation order explicit.
        to_keep = set(self.alphabet) ^ (
            set(self.degenerates) - set(self.gaps))
        self.strip_bad_and_gaps = FunctionWrapper(KeepChars("".join(to_keep)))

        # make inverse degenerates from degenerates
        # ensure that lowercase versions also exist if appropriate
        inv_degens = {}
        for key, val in self.degenerates.items():
            inv_degens[frozenset(val)] = key.upper()
            if add_lower:
                inv_degens[frozenset("".join(val).lower())] = key.lower()
        for m in self.alphabet:
            inv_degens[frozenset(m)] = m
            if add_lower:
                inv_degens[frozenset("".join(m).lower())] = m.lower()
        for m in self.gaps:
            inv_degens[frozenset(m)] = m
        self.inverse_degenerates = inv_degens

        # set array type for modeling alphabets
        try:
            self.array_type = self.alphabet.array_type
        except AttributeError:
            self.array_type = None

        # set modeling sequence
        self._make_array_seq = array_seq_constructor

        self._colors = colors or defaultdict(_DefaultValue("black"))
Beispiel #8
0
 def test_to_chars(self):
     """CharAlphabet to_chars should convert an input array to chars"""
     alphabet = CharAlphabet("UCAG")
     indices = array([[0, 0, 1], [0, 3, 2]], "B")
     expected = array(["UUC", "UGA"], "c")
     assert_equal(alphabet.to_chars(indices), expected)
Beispiel #9
0
 def test_from_array(self):
     """CharAlphabet from_array should return correct array"""
     alphabet = CharAlphabet("UCAG")
     chars = array(["UUC", "UGA"], "c")
     expected = array([[0, 0, 1], [0, 3, 2]], "B")
     assert_equal(alphabet.from_array(chars), expected)
Beispiel #10
0
 def test_from_string(self):
     """CharAlphabet from_string should return correct array"""
     alphabet = CharAlphabet("UCAG")
     observed = alphabet.from_string("UUCUGA")
     # each character maps to its position in "UCAG"
     assert_equal(observed, array([0, 0, 1, 0, 3, 2], "B"))
Beispiel #11
0
from cogent3.core.alphabet import (
    CharAlphabet,
    Enumeration,
    JointEnumeration,
    _make_complement_array,
    _make_translation_tables,
    array,
    get_array_type,
    uint8,
    uint16,
    uint32,
)
from cogent3.core.moltype import RNA, get_moltype

# Module-level CharAlphabet instances: the four DNA bases, the four RNA
# bases and 20 single-letter amino acid codes.
# NOTE(review): presumably shared fixtures for the tests below -- confirm.
DnaBases = CharAlphabet("TCAG")
RnaBases = CharAlphabet("UCAG")
AminoAcids = CharAlphabet("ACDEFGHIKLMNPQRSTVWY")

# Module metadata (authorship, licence, version, maintainer contact).
__author__ = "Rob Knight, Peter Maxwell and Gavin Huttley"
__copyright__ = "Copyright 2007-2021, The Cogent Project"
__credits__ = ["Peter Maxwell", "Rob Knight", "Gavin Huttley"]
__license__ = "BSD-3"
__version__ = "2021.04.20a"
__maintainer__ = "Gavin Huttley"
__email__ = "*****@*****.**"
__status__ = "Production"


class translation_table_tests(TestCase):
    """Tests of top-level translation table functions"""