Exemple #1
0
    def test_keep_chars(self):
        """keep_chars(keep) gives a filter whose output has only chars in keep"""
        keep_filter = keep_chars('ab c3*[')

        #empty input comes back empty; None is rejected with AttributeError
        self.assertEqual(keep_filter(''), '')
        self.assertRaises(AttributeError, keep_filter, None)

        #case sensitive: short inputs first, then longer ones
        case_sensitive = [
            ('b', 'b'),
            ('g', ''),
            ('xyz123', '3'),
            ('xyz  123', '  3'),
            ('kjbwherzcagebcujrkcs', 'bcabcc'),
            ('f[ffff*ff*fff3fff', '[**3'),
        ]
        for raw, expected in case_sensitive:
            self.assertEqual(keep_filter(raw), expected)

        #case insensitive: input keeps its own casing in the output
        keep_filter = keep_chars('AbC', False)
        case_insensitive = [
            ('abcdef', 'abc'),
            ('ABCDEF', 'ABC'),
            ('aBcDeF', 'aBc'),
        ]
        for raw, expected in case_insensitive:
            self.assertEqual(keep_filter(raw), expected)
Exemple #2
0
    def test_keep_chars(self):
        """keep_chars(keep) gives a filter whose output has only chars in keep"""
        sensitive = keep_chars('ab c3*[')

        #empty string and None
        self.assertEqual(sensitive(''), '')
        self.assertRaises(AttributeError, sensitive, None)

        #case sensitive, expected output keyed in the order checked
        sensitive_cases = (
            ('b', 'b'),
            ('g', ''),
            ('xyz123', '3'),
            ('xyz  123', '  3'),
            ('kjbwherzcagebcujrkcs', 'bcabcc'),
            ('f[ffff*ff*fff3fff', '[**3'),
        )
        for given, wanted in sensitive_cases:
            self.assertEqual(sensitive(given), wanted)

        #case insensitive (second argument False)
        insensitive = keep_chars('AbC', False)
        insensitive_cases = (
            ('abcdef', 'abc'),
            ('ABCDEF', 'ABC'),
            ('aBcDeF', 'aBc'),
        )
        for given, wanted in insensitive_cases:
            self.assertEqual(insensitive(given), wanted)
Exemple #3
0
    def test_keep_chars(self):
        """keep_chars(keep) gives a filter whose output has only chars in keep"""
        strict = keep_chars("ab c3*[")

        # empty input, then None (must raise AttributeError)
        self.assertEqual(strict(""), "")
        self.assertRaises(AttributeError, strict, None)

        # case sensitive: (input, expected) pairs checked in order
        strict_pairs = [
            ("b", "b"),
            ("g", ""),
            ("xyz123", "3"),
            ("xyz  123", "  3"),
            ("kjbwherzcagebcujrkcs", "bcabcc"),
            ("f[ffff*ff*fff3fff", "[**3"),
        ]
        for text, kept in strict_pairs:
            self.assertEqual(strict(text), kept)

        # case insensitive: matching ignores case, output keeps input case
        relaxed = keep_chars("AbC", False)
        relaxed_pairs = [
            ("abcdef", "abc"),
            ("ABCDEF", "ABC"),
            ("aBcDeF", "aBc"),
        ]
        for text, kept in relaxed_pairs:
            self.assertEqual(relaxed(text), kept)
    def __init__(self, chars, name=None, invert_charset=False, strip_f=strip, default_char=None):
        """Returns new CharFilter object.

        chars: characters handed to the filter builders.
        name: stored as self.Name.
        invert_charset: stored as self.Invert; selects the trans_all /
            exclude_chars builders instead of trans_except / keep_chars.
        strip_f: stored as self.stripF.
        default_char: when truthy, self.Filter translates strings through a
            table built from chars and default_char; otherwise self.Filter
            is whatever exclude_chars or keep_chars returns for chars.
        """
        self.Chars = chars
        self.Name = name
        self.Invert = invert_charset
        if default_char:
            # translation-table flavour: pick the table builder, then wrap it
            build_table = trans_all if invert_charset else trans_except
            table = build_table(chars, default_char)
            self.Filter = lambda s: s.translate(table)
        else:
            # plain keep/exclude flavour
            build_filter = exclude_chars if invert_charset else keep_chars
            self.Filter = build_filter(chars)
        self.stripF = strip_f
Exemple #5
0
 def __init__(self,
              chars,
              name=None,
              invert_charset=False,
              strip_f=strip,
              default_char=None):
     """Returns new CharFilter object.

     Stores chars, name, invert_charset and strip_f on the instance (as
     Chars, Name, Invert and stripF) and sets self.Filter:

         - no (falsy) default_char: the function returned by exclude_chars
           (inverted) or keep_chars (not inverted) for chars;
         - truthy default_char: a function that calls s.translate() with a
           table from trans_all (inverted) or trans_except (not inverted).
     """
     self.Chars = chars
     self.Name = name
     self.Invert = invert_charset
     if not default_char:
         # no replacement character: use the keep/exclude filter directly
         chooser = exclude_chars if invert_charset else keep_chars
         self.Filter = chooser(chars)
     else:
         # replacement character given: go through a translation table
         table_maker = trans_all if invert_charset else trans_except
         translation = table_maker(chars, default_char)
         self.Filter = lambda s: s.translate(translation)
     self.stripF = strip_f
Exemple #6
0
 def __init__(self, motifset, Gap=IUPAC_gap, Missing=IUPAC_missing,\
         Gaps=None,
         Sequence=None, Ambiguities=None,
         label=None, Complements=None, Pairs=None, MWCalculator=None, \
         add_lower=False, preserve_existing_moltypes=False, \
         make_alphabet_group=False, ModelSeq=None):
     """Returns a new MolType object. Note that the parameters are in flux.

     Currently:
         motifset: Alphabet or sequence of items in the default
             alphabet. Does not include degenerates.

         Gap: default gap symbol

         Missing: symbol for missing data

         Gaps: any other symbols that should be treated as gaps (doesn't have
               to include Gap or Missing; they will be silently added)

         Sequence: Class for constructing sequences. Defaults to ''.join.
             Unless preserve_existing_moltypes is True, a supplied Sequence
             has its MolType attribute set to this object.

         Ambiguities: dict of char:tuple, doesn't need to include the gap
             and missing symbols (entries for Gap and Missing are added
             here).

         label: text label, don't know what this is used for. Unnecessary?

         Complements: dict of symbol:symbol showing how the non-degenerate
             single characters complement each other. Used for constructing
             on the fly the complement table, incl. support for mustPair and
             canPair.

         Pairs: dict in which keys are pairs of symbols that can pair
             with each other, values are True (must pair) or False (might
             pair). Currently, the meaning of GU pairs as 'weak' is conflated
             with the meaning of degenerate symbol pairs (which might pair
             with each other but don't necessarily, depending on how the
             symbol is resolved). This should be refactored.

         MWCalculator: f(seq) -> molecular weight.

         add_lower: if True (default: False) adds the lowercase versions of
             everything into the alphabet. Slated for deletion.

         preserve_existing_moltypes: if True (default: False), does not
             set the MolType of the supplied Sequence to self.

         make_alphabet_group: if True, makes an AlphabetGroup relating
             the various alphabets to one another.

         ModelSeq: sequence type for modeling

     Note on "Degenerates" versus "Ambiguities": self.Degenerates contains
     _only_ mappings for degenerate symbols, whereas self.Ambiguities
     contains mappings for both degenerate and non-degenerate symbols.
     Sometimes you want one, sometimes the other, so both are provided.
     """
     self.Gap = Gap
     self.Missing = Missing
     # Gap and Missing always count as gaps; caller-supplied Gaps are merged in
     self.Gaps = frozenset([Gap, Missing])
     if Gaps:
         self.Gaps = self.Gaps.union(frozenset(Gaps))
     self.label = label
     #set the sequence constructor
     # NOTE: a supplied Sequence is mutated (its MolType attribute is
     # rebound to this object) unless preserve_existing_moltypes is True
     if Sequence is None:
         Sequence = ''.join     #safe default string constructor
     elif not preserve_existing_moltypes:
         Sequence.MolType = self
     self.Sequence = Sequence
     
     #set the ambiguities
     # Missing may resolve to any motif or the gap; Gap resolves only to itself
     ambigs = {self.Missing:tuple(motifset)+(self.Gap,),self.Gap:(self.Gap,)}
     if Ambiguities:
         ambigs.update(Ambiguities)
     # every motif maps to itself; this overrides any same-key entry that
     # came from Ambiguities
     for c in motifset:
         ambigs[c] = (c,)
     self.Ambiguities = ambigs
     
     #set Complements -- must set before we make the alphabet group
     self.Complements = Complements or {}
     
     if make_alphabet_group: #note: must use _original_ ambiguities here
         self.Alphabets = AlphabetGroup(motifset, Ambiguities, \
             MolType=self)
         self.Alphabet = self.Alphabets.Base
     else:
         # an Enumeration is used as-is; otherwise use CharAlphabet when the
         # longest motif has length 1, else the general Alphabet
         # NOTE(review): max() raises ValueError if motifset is empty and is
         # not an Enumeration -- confirm callers never pass an empty motifset
         if isinstance(motifset, Enumeration):
             self.Alphabet = motifset
         elif max(len(motif) for motif in motifset) == 1:
             self.Alphabet = CharAlphabet(motifset, MolType=self)
         else:
             self.Alphabet = Alphabet(motifset, MolType=self)
     #set the other properties
     # work on a copy so the caller's Ambiguities dict is not modified;
     # None or an empty dict both end up as a fresh {}
     self.Degenerates = Ambiguities and Ambiguities.copy() or {}
     # here Missing maps to a string (joined motifs plus the gap), whereas
     # self.Ambiguities stores the corresponding value as a tuple
     self.Degenerates[self.Missing] = ''.join(motifset)+self.Gap
     self.Matches = make_matches(motifset, self.Gaps, self.Degenerates)
     # start from a copy of the supplied Pairs, then add what make_pairs returns
     self.Pairs = Pairs and Pairs.copy() or {}
     self.Pairs.update(make_pairs(Pairs, motifset, self.Gaps, \
         self.Degenerates))
     self.MWCalculator = MWCalculator
     #add lowercase characters, if we're doing that
     if add_lower:
         self._add_lowercase()
     #cache various other data that make the calculations faster
     # presumably _make_all sets self.All, which is read below -- TODO confirm
     self._make_all()
     self._make_comp_table()
     # a gap can be a true gap char or a degenerate character, typically '?'
     # we therefore want to ensure consistent treatment across the definition
     # of characters as either gap or degenerate
     self.GapString = ''.join(self.Gaps)
     # gap characters that are not also keys of self.Degenerates
     strict_gap = "".join(set(self.GapString) - set(self.Degenerates))
     self.stripDegenerate = FunctionWrapper(
         keep_chars(strict_gap+''.join(self.Alphabet)))
     self.stripBad = FunctionWrapper(keep_chars(''.join(self.All)))
     # NOTE: '-' binds tighter than '^', so this evaluates as
     # set(Alphabet) ^ (set(Degenerates) - set(Gaps))
     to_keep = set(self.Alphabet) ^ set(self.Degenerates) - set(self.Gaps)
     self.stripBadAndGaps = FunctionWrapper(keep_chars(''.join(to_keep)))
     
     #make inverse degenerates from degenerates
     #ensure that lowercase versions also exist if appropriate
     # keys are frozensets of the symbols a character resolves to; later
     # loops overwrite earlier entries that share the same frozenset key
     inv_degens = {}
     for key, val in self.Degenerates.items():
         inv_degens[frozenset(val)] = key.upper()
         if add_lower:
             inv_degens[frozenset(''.join(val).lower())] = key.lower()
     for m in self.Alphabet:
         inv_degens[frozenset(m)] = m
         if add_lower:
             inv_degens[frozenset(''.join(m).lower())] = m.lower()
     for m in self.Gaps:
         inv_degens[frozenset(m)] = m
     self.InverseDegenerates = inv_degens
     
     #set array type for modeling alphabets
     # falls back to None when the alphabet has no ArrayType attribute
     try:
         self.ArrayType = self.Alphabet.ArrayType
     except AttributeError:
         self.ArrayType = None
     
     #set modeling sequence
     self.ModelSeq = ModelSeq
 def __init__(self, motifset, Gap=IUPAC_gap, Missing=IUPAC_missing,\
         Gaps=None,
         Sequence=None, Ambiguities=None,
         label=None, Complements=None, Pairs=None, MWCalculator=None, \
         add_lower=False, preserve_existing_moltypes=False, \
         make_alphabet_group=False, ModelSeq=None):
     """Returns a new MolType object. Note that the parameters are in flux.

     Currently:
         motifset: Alphabet or sequence of items in the default
             alphabet. Does not include degenerates.

         Gap: default gap symbol

         Missing: symbol for missing data

         Gaps: any other symbols that should be treated as gaps (doesn't have
               to include Gap or Missing; they will be silently added)

         Sequence: Class for constructing sequences. Defaults to ''.join.
             Unless preserve_existing_moltypes is True, a supplied Sequence
             has its MolType attribute set to this object.

         Ambiguities: dict of char:tuple, doesn't need to include the gap
             and missing symbols (entries for Gap and Missing are added
             here).

         label: text label, don't know what this is used for. Unnecessary?

         Complements: dict of symbol:symbol showing how the non-degenerate
             single characters complement each other. Used for constructing
             on the fly the complement table, incl. support for mustPair and
             canPair.

         Pairs: dict in which keys are pairs of symbols that can pair
             with each other, values are True (must pair) or False (might
             pair). Currently, the meaning of GU pairs as 'weak' is conflated
             with the meaning of degenerate symbol pairs (which might pair
             with each other but don't necessarily, depending on how the
             symbol is resolved). This should be refactored.

         MWCalculator: f(seq) -> molecular weight.

         add_lower: if True (default: False) adds the lowercase versions of
             everything into the alphabet. Slated for deletion.

         preserve_existing_moltypes: if True (default: False), does not
             set the MolType of the supplied Sequence to self.

         make_alphabet_group: if True, makes an AlphabetGroup relating
             the various alphabets to one another.

         ModelSeq: sequence type for modeling

     Note on "Degenerates" versus "Ambiguities": self.Degenerates contains
     _only_ mappings for degenerate symbols, whereas self.Ambiguities
     contains mappings for both degenerate and non-degenerate symbols.
     Sometimes you want one, sometimes the other, so both are provided.
     """
     self.Gap = Gap
     self.Missing = Missing
     # Gap and Missing always count as gaps; caller-supplied Gaps are merged in
     self.Gaps = frozenset([Gap, Missing])
     if Gaps:
         self.Gaps = self.Gaps.union(frozenset(Gaps))
     self.label = label
     #set the sequence constructor
     # NOTE: a supplied Sequence is mutated (its MolType attribute is
     # rebound to this object) unless preserve_existing_moltypes is True
     if Sequence is None:
         Sequence = ''.join     #safe default string constructor
     elif not preserve_existing_moltypes:
         Sequence.MolType = self
     self.Sequence = Sequence
     
     #set the ambiguities
     # Missing may resolve to any motif or the gap; Gap resolves only to itself
     ambigs = {self.Missing:tuple(motifset)+(self.Gap,),self.Gap:(self.Gap,)}
     if Ambiguities:
         ambigs.update(Ambiguities)
     # every motif maps to itself; this overrides any same-key entry that
     # came from Ambiguities
     for c in motifset:
         ambigs[c] = (c,)
     self.Ambiguities = ambigs
     
     #set Complements -- must set before we make the alphabet group
     self.Complements = Complements or {}
     
     if make_alphabet_group: #note: must use _original_ ambiguities here
         self.Alphabets = AlphabetGroup(motifset, Ambiguities, \
             MolType=self)
         self.Alphabet = self.Alphabets.Base
     else:
         # an Enumeration is used as-is; otherwise use CharAlphabet when the
         # longest motif has length 1, else the general Alphabet
         # NOTE(review): max() raises ValueError if motifset is empty and is
         # not an Enumeration -- confirm callers never pass an empty motifset
         if isinstance(motifset, Enumeration):
             self.Alphabet = motifset
         elif max(len(motif) for motif in motifset) == 1:
             self.Alphabet = CharAlphabet(motifset, MolType=self)
         else:
             self.Alphabet = Alphabet(motifset, MolType=self)
     #set the other properties
     # work on a copy so the caller's Ambiguities dict is not modified;
     # None or an empty dict both end up as a fresh {}
     self.Degenerates = Ambiguities and Ambiguities.copy() or {}
     # here Missing maps to a string (joined motifs plus the gap), whereas
     # self.Ambiguities stores the corresponding value as a tuple
     self.Degenerates[self.Missing] = ''.join(motifset)+self.Gap
     self.Matches = make_matches(motifset, self.Gaps, self.Degenerates)
     # start from a copy of the supplied Pairs, then add what make_pairs returns
     self.Pairs = Pairs and Pairs.copy() or {}
     self.Pairs.update(make_pairs(Pairs, motifset, self.Gaps, \
         self.Degenerates))
     self.MWCalculator = MWCalculator
     #add lowercase characters, if we're doing that
     if add_lower:
         self._add_lowercase()
     #cache various other data that make the calculations faster
     # presumably _make_all sets self.All, which is read below -- TODO confirm
     self._make_all()
     self._make_comp_table()
     # a gap can be a true gap char or a degenerate character, typically '?'
     # we therefore want to ensure consistent treatment across the definition
     # of characters as either gap or degenerate
     self.GapString = ''.join(self.Gaps)
     # gap characters that are not also keys of self.Degenerates
     strict_gap = "".join(set(self.GapString) - set(self.Degenerates))
     self.stripDegenerate = FunctionWrapper(
         keep_chars(strict_gap+''.join(self.Alphabet)))
     self.stripBad = FunctionWrapper(keep_chars(''.join(self.All)))
     # NOTE: '-' binds tighter than '^', so this evaluates as
     # set(Alphabet) ^ (set(Degenerates) - set(Gaps))
     to_keep = set(self.Alphabet) ^ set(self.Degenerates) - set(self.Gaps)
     self.stripBadAndGaps = FunctionWrapper(keep_chars(''.join(to_keep)))
     
     #make inverse degenerates from degenerates
     #ensure that lowercase versions also exist if appropriate
     # keys are frozensets of the symbols a character resolves to; later
     # loops overwrite earlier entries that share the same frozenset key
     inv_degens = {}
     for key, val in self.Degenerates.items():
         inv_degens[frozenset(val)] = key.upper()
         if add_lower:
             inv_degens[frozenset(''.join(val).lower())] = key.lower()
     for m in self.Alphabet:
         inv_degens[frozenset(m)] = m
         if add_lower:
             inv_degens[frozenset(''.join(m).lower())] = m.lower()
     for m in self.Gaps:
         inv_degens[frozenset(m)] = m
     self.InverseDegenerates = inv_degens
     
     #set array type for modeling alphabets
     # falls back to None when the alphabet has no ArrayType attribute
     try:
         self.ArrayType = self.Alphabet.ArrayType
     except AttributeError:
         self.ArrayType = None
     
     #set modeling sequence
     self.ModelSeq = ModelSeq