Esempio n. 1
0
 def ParseComplEsc(self):
     """Parses the complement of a category escape, written \\P{...}.

     Returns a new CharClass built from the property parsed between the
     braces and then negated.  Raises RegularExpressionError if the input
     does not start with \\P{ or the closing brace is missing."""
     if not self.Match("\\P{"):
         raise RegularExpressionError("Expected \\P{...} at [%i]" % self.pos)
     # step over the three characters of the \P{ introducer
     self.SetPos(self.pos + 3)
     complement = CharClass(self.ParseCharProp())
     if self.theChar != '}':
         raise RegularExpressionError("Expected \\P{...} at [%i]" % self.pos)
     self.NextChar()
     complement.Negate()
     return complement
Esempio n. 2
0
 def ParseComplEsc(self):
     """Parses the complement of a category escape, written \\P{...}.

     The character property between the braces is parsed, wrapped in a
     new CharClass and negated before being returned.  Any other input
     results in RegularExpressionError."""
     closed = False
     if self.match("\\P{"):
         # the introducer \P{ is three characters long
         self.setpos(self.pos + 3)
         negated = CharClass(self.ParseCharProp())
         closed = self.the_char == '}'
     if not closed:
         raise RegularExpressionError("Expected \\P{...} at [%i]" % self.pos)
     self.next_char()
     negated.negate()
     return negated
 def ParsePosCharGroup(self):
     """Returns a CharClass representing a positive character group.

     Character ranges and character class escapes are accumulated into a
     single CharClass until neither can be parsed.  A hyphen that follows
     at least one item must either introduce a subtraction ("-[", left
     unparsed for the caller) or be the last character of the group.
     Raises RegularExpressionError for a misplaced hyphen or when the
     group contains no items at all."""
     group = CharClass()
     count = 0
     while True:
         start = self.pos
         if self.theChar == u"-" and count:
             # This had better be the last hyphen in the posCharGroup
             if self.Match(u"-["):
                 # a subtraction, the caller deals with it
                 break
             if not (self.Match(u"-]") or self.Match(u"--[")):
                 # this is all wrong
                 raise RegularExpressionError(
                     "hyphen must be first or last character in posCharGroup [%i]"
                     % self.pos)
             # a literal hyphen closing the group
             group.AddChar(u"-")
             self.NextChar()
             break
         try:
             group.AddClass(self.ParseCharRange())
         except RegularExpressionError:
             # not a range, rewind and try an escape instead
             self.SetPos(start)
         else:
             count += 1
             continue
         try:
             group.AddClass(self.ParseCharClassEsc())
         except RegularExpressionError:
             if not count:
                 # We expected either a charRange or a charClassEsc
                 raise RegularExpressionError(
                     "Expected charRange or charClassEsc at [%i]" %
                     self.pos)
             # the group is complete, leave the input where this pass began
             self.SetPos(start)
             break
         else:
             count += 1
     return group
Esempio n. 4
0
 def ParseIsCategory(self):
     """Parses a one or two character category name.

     The current character must be a key of Categories; if appending the
     following character also gives a key then the two-character name is
     used.  Returns the result of CharClass.ucd_category for that name or
     raises RegularExpressionError when no category name is present."""
     if self.the_char in self.Categories:
         name = self.the_char
         self.next_char()
         follow = self.the_char
         # prefer the longer, two-character form when it is defined
         if follow is not None and (name + follow) in self.Categories:
             name += follow
             self.next_char()
         return CharClass.ucd_category(name)
     raise RegularExpressionError(
         "Expected category name [%i]" % self.pos)
Esempio n. 5
0
 def ParseCharRange(self):
     """Returns a CharClass for a character range or a single character.

     A start-end range is tried first; if that fails the position is
     restored and a single character accepted by IsXmlCharIncDash is
     taken instead.  If neither applies the error from the range parse
     is re-raised."""
     start = self.pos
     try:
         return self.ParseSERange()
     except RegularExpressionError:
         self.SetPos(start)
         if not self.IsXmlCharIncDash():
             raise
         single = CharClass(self.theChar)
         self.NextChar()
         return single
Esempio n. 6
0
 def ParseIsCategory(self):
     """Parses a one or two character category name.

     Returns the result of CharClass.UCDCategory for the longest name
     (one or two characters) found in Categories at the current position;
     raises RegularExpressionError if the current character does not
     start a category name."""
     known = self.Categories
     first = self.theChar
     if first not in known:
         raise RegularExpressionError(
             "Expected category name [%i]" % self.pos)
     self.NextChar()
     name = first
     if self.theChar is not None:
         candidate = first + self.theChar
         if candidate in known:
             # the two-character form wins over the single letter
             name = candidate
             self.NextChar()
     return CharClass.UCDCategory(name)
Esempio n. 7
0
 def ParseSERange(self):
     """Returns a CharClass for a range written as start-end.

     Both ends are parsed with ParseCharOrEsc.  Raises
     RegularExpressionError if the hyphen is missing or if the start
     character sorts after the end character."""
     first = self.ParseCharOrEsc()
     if self.theChar != u"-":
         raise RegularExpressionError(
             "Expected '-' in seRange [%i]" % self.pos)
     self.NextChar()
     last = self.ParseCharOrEsc()
     # a range running backwards would match nothing
     if ord(first) > ord(last):
         raise RegularExpressionError(
             "Empty SERange: %s-%s" % (repr(first), repr(last)))
     return CharClass((first, last))
Esempio n. 8
0
 def ParseIsBlock(self):
     """Returns a CharClass corresponding to one of the Unicode blocks.

     The run of characters accepted by IsBlockClass is consumed; it must
     start with "Is" and the remainder is looked up with
     CharClass.ucd_block.  Raises RegularExpressionError if the "Is"
     prefix is missing or if the lookup fails with KeyError."""
     block = []
     while self.IsBlockClass.test(self.the_char):
         block.append(self.the_char)
         self.next_char()
     # str.join replaces the deprecated string.join function and removes
     # the dependency on the string module
     block = ''.join(block)
     if block.startswith("Is"):
         try:
             return CharClass.ucd_block(block[2:])
         except KeyError:
             raise RegularExpressionError(
                 "Invalid IsBlock name: %s" % block[2:])
     else:
         raise RegularExpressionError("Expected IsBlock [%i]" % self.pos)
Esempio n. 9
0
 def ParseIsBlock(self):
     """Returns a CharClass corresponding to one of the Unicode blocks.

     The run of characters accepted by IsBlockClass is consumed; it must
     start with "Is" and the remainder is looked up with
     CharClass.UCDBlock.  Raises RegularExpressionError if the "Is"
     prefix is missing or if the lookup fails with KeyError."""
     block = []
     while self.IsBlockClass.Test(self.theChar):
         block.append(self.theChar)
         self.NextChar()
     # str.join replaces the deprecated string.join function and removes
     # the dependency on the string module
     block = ''.join(block)
     if block.startswith("Is"):
         try:
             return CharClass.UCDBlock(block[2:])
         except KeyError:
             raise RegularExpressionError(
                 "Invalid IsBlock name: %s" % block[2:])
     else:
         raise RegularExpressionError("Expected IsBlock [%i]" % self.pos)
Esempio n. 10
0
 def ParseCharClassEsc(self):
     """Returns a CharClass instance for the escape at the current position.

     Category escapes (\\p), their complements (\\P), multi-character
     escapes and finally single character escapes are tried in that
     order.  Raises RegularExpressionError if the current character is
     not a backslash or no escape can be parsed."""
     if self.Match(u"\\p"):
         return self.ParseCatEsc()
     if self.Match(u"\\P"):
         return self.ParseComplEsc()
     if self.theChar != u"\\":
         raise RegularExpressionError(
             "Expected charClassEsc at [%i]" % self.pos)
     mark = self.pos
     try:
         return self.ParseMultiCharEsc()
     except RegularExpressionError:
         # not a multi-character escape, rewind and treat it as a
         # single character escape
         self.SetPos(mark)
         return CharClass(self.ParseSingleCharEsc())
Esempio n. 11
0
 def ParsePosCharGroup(self):
     """Returns a CharClass representing a positive character group.

     Items are parsed repeatedly, each being a character range or,
     failing that, a character class escape, and merged into one
     CharClass.  Once at least one item has been read a hyphen must
     either start a subtraction ("-[", which is left for the caller) or
     be the final character of the group.  Raises RegularExpressionError
     for a misplaced hyphen or an empty group."""
     merged = CharClass()
     parsed = 0
     while True:
         mark = self.pos
         if self.the_char == u"-" and parsed:
             # This had better be the last hyphen in the posCharGroup
             if self.match(u"-["):
                 # a subtraction, nothing is consumed here
                 break
             if not (self.match(u"-]") or self.match(u"--[")):
                 # this is all wrong
                 raise RegularExpressionError(
                     "hyphen must be first or last character in posCharGroup [%i]" % self.pos)
             # trailing literal hyphen
             merged.add_char(u"-")
             self.next_char()
             break
         try:
             merged.add_class(self.ParseCharRange())
         except RegularExpressionError:
             # no range here; go back and look for an escape
             self.setpos(mark)
         else:
             parsed += 1
             continue
         try:
             merged.add_class(self.ParseCharClassEsc())
         except RegularExpressionError:
             if not parsed:
                 # We expected either a charRange or a charClassEsc
                 raise RegularExpressionError(
                     "Expected charRange or charClassEsc at [%i]" % self.pos)
             # end of the group: restore the position saved for this pass
             self.setpos(mark)
             break
         else:
             parsed += 1
     return merged
Esempio n. 12
0
        """A convenience function, returns True if the expression matches *target*."""
        m = self.p.match(target)
        if m is None or m.end(0) < len(target):
            # print "No match"
            return 0
        else:
            # print "match"
            return 1


class RegularExpressionError(Exception):
    """Raised when a regular expression cannot be parsed."""


# Module-level character classes.  Each one is paired with a second class
# that is built from it and then negated; negate() is called purely for
# its effect on the instance.  The names follow the letters of the
# multi-character escapes (presumably \s, \S, \i, \I, \C, \d, \D, \w and
# \W; confirm against the MultiCharEscapes table of the parser).

# tab, line feed, carriage return and space
sClass = CharClass(u'\x09', u'\x0A', u'\x0D', u' ')
SClass = CharClass(sClass)
SClass.negate()
# LetterCharClass plus underscore and colon
iClass = CharClass(LetterCharClass, u'_', u':')
IClass = CharClass(iClass)
IClass.negate()
# negation of NameCharClass (NameCharClass itself is defined elsewhere)
CClass = CharClass(NameCharClass)
CClass.negate()
# Unicode category Nd
dClass = CharClass.ucd_category('Nd')
DClass = CharClass(dClass)
DClass.negate()
# here the upper-case class is built first, from the categories P, Z and
# C, and the lower-case class is derived from it by negation
WClass = CharClass(CharClass.ucd_category(
    'P'), CharClass.ucd_category('Z'), CharClass.ucd_category('C'))
wClass = CharClass(WClass)
wClass.negate()

Esempio n. 13
0
class RegularExpressionParser(BasicParser):

    """A custom parser for XML schema regular expressions.

    The parser is initialised from a source string, the string to be parsed.

    The Parse* methods consume input starting at the current position.
    Methods that build a regular expression return it as a string in
    python's regular expression syntax; methods that parse character
    classes return CharClass instances.  Parsing failures are reported by
    raising RegularExpressionError."""

    def _REEscape(self, c=None):
        """Returns *c* escaped for use in a python regular expression.

        If *c* is None the current character (theChar) is used.  A
        character that appears in the list of special characters below is
        prefixed with a backslash, any other character is returned
        unchanged."""
        if c is None:
            # default to the current character; this has to be an
            # assignment, a comparison would leave c set to None
            c = self.theChar
        if c in u".^$*+?{}\\[]|()":
            return "\\" + c
        else:
            return c

    def ParseRegularExpression(self):
        """Returns the source string (self.src) unchanged, nothing is parsed."""
        return self.src

    def ParseRegExp(self):
        """Returns a unicode string representing the regular expression.

        One or more branches separated by '|' are parsed and joined
        together, with the separators, into a single string."""
        # expression ends at the end of the string or at a closing bracket
        result = [self.ParseBranch()]
        while self.theChar == u"|":
            self.NextChar()
            result.append(u"|")
            result.append(self.ParseBranch())
        return ''.join(result)

    def ParseBranch(self):
        """Returns a unicode string representing this branch as a python
        regular expression.

        Pieces are parsed for as long as the current character can start
        one: a normal character, '.', a backslash, '[' or '('."""
        result = []
        while self.IsChar() or self.MatchOne(u".\\[("):
            result.append(self.ParsePiece())
        return ''.join(result)

    def ParsePiece(self):
        """Returns a unicode string representing this piece as a python
        regular expression.

        A piece is an atom followed by an optional quantifier.  The
        quantifier is written in the shortest equivalent form: '*', '?'
        and '+' are used where they apply, a quantity of exactly one
        leaves the atom unchanged and a quantity of exactly zero results
        in an empty string."""
        result = self.ParseAtom()
        if self.MatchOne("?*+{"):
            n, m = self.ParseQuantifier()
            if n == 0:
                if m is None:
                    return result + u"*"
                elif m == 0:
                    # zero repetitions: the atom contributes nothing
                    return u""
                elif m == 1:
                    return result + u"?"
                else:
                    return "%s{,%i}" % (result, m)
            elif n == 1:
                if m is None:
                    return result + u"+"
                elif m == 1:
                    return result
                else:
                    return "%s{1,%i}" % (result, m)
            elif m is None:
                return "%s{%i,}" % (result, n)
            elif n == m:
                return "%s{%i}" % (result, n)
            else:
                return "%s{%i,%i}" % (result, n, m)
        else:
            return result

    def ParseQuantifier(self):
        """Returns a tuple of n,m.

        Symbolic values are expanded to the appropriate pair.  The second
        value may be None indicating unbounded.  Raises
        RegularExpressionError if there is no quantifier at the current
        position or if a '{' quantity is not closed by '}'."""
        if self.theChar == u"?":
            self.NextChar()
            return 0, 1
        elif self.theChar == u"*":
            self.NextChar()
            return 0, None
        elif self.theChar == u"+":
            self.NextChar()
            return 1, None
        elif self.theChar == u"{":
            self.NextChar()
            result = self.ParseQuantity()
            if self.theChar == u"}":
                self.NextChar()
                return result
            else:
                raise RegularExpressionError("Expected } at [%i]" % self.pos)
        else:
            raise RegularExpressionError(
                "Expected quantifier at [%i]" % self.pos)

    def ParseQuantity(self):
        """Returns a tuple of n,m even if an exact quantity is given.

        In other words, the exact quantity 'n' returns n,n.  The second
        value may be None indicating unbounded.  Raises
        RegularExpressionError if the lower bound exceeds the upper
        bound."""
        n = self.ParseQuantExact()
        m = None
        if self.theChar == u",":
            self.NextChar()
            # no digits after the comma means there is no upper bound
            if self.MatchOne(u"0123456789"):
                m = self.ParseQuantExact()
                if n > m:
                    raise RegularExpressionError(
                        "Illegal quantity: {%i,%i}" % (n, m))
        else:
            m = n
        return n, m

    def ParseQuantExact(self):
        """Returns an integer parsed from a run of one or more decimal
        digits; raises RegularExpressionError if there are no digits."""
        result = 0
        nDigits = 0
        while self.MatchOne(u"0123456789"):
            # 0x30 is the character code of the digit zero
            result = result * 10 + ord(self.theChar) - 0x30
            self.NextChar()
            nDigits += 1
        if nDigits == 0:
            raise RegularExpressionError("Expected digit at [%i]" % self.pos)
        return result

    def ParseAtom(self):
        """Returns a unicode string representing this atom as a python
        regular expression.

        An atom is a normal character (escaped if necessary), a
        parenthesised regular expression or a character class.  Raises
        RegularExpressionError if a parenthesised expression is not
        closed."""
        if self.IsChar():
            result = self._REEscape(self.theChar)
            self.NextChar()
        elif self.theChar == "(":
            # a regular expression
            self.NextChar()
            result = "(%s)" % self.ParseRegExp()
            if self.theChar != ")":
                raise RegularExpressionError("Expected ) at [%i]" % self.pos)
            self.NextChar()
        else:
            cClass = self.ParseCharClass()
            result = unicode(cClass)
        return result

    def IsChar(self, c=None):
        """Returns True if *c* (by default, the current character) is a
        normal character.

        The definition of this function is designed to be conservative with
        respect to the specification, which is clearly in error around
        production [10] as the prose and the BNF do not match.  It appears that
        | was intended to be excluded in the prose but has been omitted, the
        reverse being true for the curly-brackets."""
        if c is None:
            c = self.theChar
        return c is not None and c not in ".\\?*+{}()[]|"

    def ParseCharClass(self):
        """Returns a CharClass instance representing this class.

        Dispatches on the current character: '[' starts a class
        expression, a backslash starts an escape and '.' is the wildcard.
        Anything else raises RegularExpressionError."""
        if self.theChar == u"[":
            return self.ParseCharClassExpr()
        elif self.theChar == u"\\":
            return self.ParseCharClassEsc()
        elif self.theChar == u".":
            return self.ParseWildcardEsc()
        else:
            raise RegularExpressionError(
                "Expected [, \\ or . at [%i]" % self.pos)

    def ParseCharClassExpr(self):
        """Returns a CharClass instance representing this class expression.

        The expression is a character group enclosed in square brackets;
        raises RegularExpressionError if either bracket is missing."""
        if self.theChar == "[":
            self.NextChar()
            cClass = self.ParseCharGroup()
            if self.theChar == "]":
                self.NextChar()
                return cClass
            else:
                raise RegularExpressionError("Expected ] at [%i]" % self.pos)
        else:
            raise RegularExpressionError("Expected [ at [%i]" % self.pos)

    def ParseCharGroup(self):
        """Returns a CharClass representing this group.  This method also
        handles the case of a class subtraction directly to reduce the need for
        look-ahead.  If you specifically want to parse a subtraction you can do
        this with :py:meth:`ParseCharClassSub`."""
        if self.theChar == u"^":
            cClass = self.ParseNegCharGroup()
        else:
            cClass = self.ParsePosCharGroup()
        if self.theChar == u"-":
            # a hyphen left over by the group parser introduces a subtraction
            self.NextChar()
            subClass = self.ParseCharClassExpr()
            cClass.SubtractClass(subClass)
        return cClass

    def ParsePosCharGroup(self):
        """Returns a CharClass representing a positive range.

        Character ranges and character class escapes are accumulated
        until neither can be parsed.  A hyphen that follows at least one
        item must either introduce a subtraction ("-[", left unparsed for
        the caller) or be the last character of the group.  Raises
        RegularExpressionError for a misplaced hyphen or when the group
        contains no items."""
        cClass = CharClass()
        nRanges = 0
        while True:
            savePos = self.pos
            if self.theChar == u"-" and nRanges:
                # This had better be the last hyphen in the posCharGroup
                if self.Match(u"-["):
                    # a subtraction
                    break
                elif self.Match(u"-]") or self.Match(u"--["):
                    cClass.AddChar(u"-")
                    self.NextChar()
                    break
                else:
                    # this is all wrong
                    raise RegularExpressionError(
                        "hyphen must be first or last character in posCharGroup [%i]" % self.pos)
            try:
                cClass.AddClass(self.ParseCharRange())
                nRanges += 1
                continue
            except RegularExpressionError:
                # not a range, rewind and try an escape instead
                self.SetPos(savePos)
            try:
                cClass.AddClass(self.ParseCharClassEsc())
                nRanges += 1
                continue
            except RegularExpressionError:
                if nRanges:
                    # the group is complete
                    self.SetPos(savePos)
                    break
                else:
                    # We expected either a charRange or a charClassEsc
                    raise RegularExpressionError(
                        "Expected charRange or charClassEsc at [%i]" % self.pos)
        return cClass

    def ParseNegCharGroup(self):
        """Returns a CharClass representing this range.

        The group must start with '^'; the positive group that follows is
        parsed and negated.  Raises RegularExpressionError if the '^' is
        missing."""
        if self.theChar == u"^":
            # we have a negative range
            self.NextChar()
            cClass = self.ParsePosCharGroup()
            cClass.Negate()
            return cClass
        else:
            raise RegularExpressionError(
                "Expected negCharGroup at [%i]" % self.pos)

    def ParseCharClassSub(self):
        """Returns a CharClass representing this range - this method is not
        normally used by the parser and is present for completeness.  See
        :py:meth:`ParseCharGroup`.

        Unlike ParseCharGroup the subtraction is required here:
        RegularExpressionError is raised if no hyphen follows the
        group."""
        if self.theChar == u"^":
            cClass = self.ParseNegCharGroup()
        else:
            cClass = self.ParsePosCharGroup()
        if self.theChar == u"-":
            self.NextChar()
            subClass = self.ParseCharClassExpr()
            cClass.SubtractClass(subClass)
            return cClass
        else:
            raise RegularExpressionError("Expected - at [%i]" % self.pos)

    def ParseCharRange(self):
        """Returns a CharClass representing this range.

        A start-end range is tried first; if that fails the position is
        restored and a single character accepted by IsXmlCharIncDash is
        taken instead.  If neither applies the error from the range parse
        is re-raised."""
        savePos = self.pos
        try:
            cClass = self.ParseSERange()
        except RegularExpressionError:
            self.SetPos(savePos)
            if self.IsXmlCharIncDash():
                cClass = CharClass(self.theChar)
                self.NextChar()
            else:
                raise
        return cClass

    def ParseSERange(self):
        """Returns a CharClass representing this range.

        The range is written start-end with each end parsed by
        ParseCharOrEsc.  Raises RegularExpressionError if the hyphen is
        missing or if the start character sorts after the end
        character."""
        s = self.ParseCharOrEsc()
        if self.theChar == u"-":
            self.NextChar()
        else:
            raise RegularExpressionError(
                "Expected '-' in seRange [%i]" % self.pos)
        e = self.ParseCharOrEsc()
        if ord(s) > ord(e):
            raise RegularExpressionError(
                "Empty SERange: %s-%s" % (repr(s), repr(e)))
        return CharClass((s, e))

    def ParseCharOrEsc(self):
        """Returns a single unicode character.

        The character is taken literally if IsXmlChar accepts it,
        otherwise it is parsed as a single character escape."""
        if self.IsXmlChar():
            result = self.theChar
            self.NextChar()
            return result
        else:
            return self.ParseSingleCharEsc()

    def IsXmlChar(self, c=None):
        """Returns True if *c* (by default, the current character) is not
        None and is not one of backslash, hyphen or the square
        brackets."""
        if c is None:
            c = self.theChar
        return c is not None and c not in "\\-[]"

    def IsXmlCharIncDash(self, c=None):
        """Returns True if *c* (by default, the current character) is not
        None and is not a backslash or a square bracket; unlike IsXmlChar
        the hyphen is accepted."""
        if c is None:
            c = self.theChar
        return c is not None and c not in "\\[]"

    def ParseCharClassEsc(self):
        """Returns a CharClass instance representing one of the escape sequences.

        Category escapes (\\p), their complements (\\P), multi-character
        escapes and finally single character escapes are tried in that
        order.  Raises RegularExpressionError if the current character is
        not a backslash or no escape can be parsed."""
        if self.Match(u"\\p"):
            cClass = self.ParseCatEsc()
        elif self.Match(u"\\P"):
            cClass = self.ParseComplEsc()
        elif self.theChar == u"\\":
            # remember the position before the try block so that it is
            # always bound when the handler runs
            savePos = self.pos
            try:
                cClass = self.ParseMultiCharEsc()
            except RegularExpressionError:
                self.SetPos(savePos)
                cClass = CharClass(self.ParseSingleCharEsc())
        else:
            raise RegularExpressionError(
                "Expected charClassEsc at [%i]" % self.pos)
        return cClass

    # Maps the character following the backslash in a single character
    # escape to the character it represents.
    SingleCharEscapes = {
        u'n': unichr(0x0A),
        u'r': unichr(0x0D),
        u't': unichr(0x09),
        u'\\': u'\\',
        u'|': u'|',
        u'.': u'.',
        u'-': u'-',
        u'^': u'^',
        u'?': u'?',
        u'*': u'*',
        u'+': u'+',
        u'{': u'{',
        u'}': u'}',
        u'(': u'(',
        u')': u')',
        u'[': u'[',
        u']': u']'
    }

    def ParseSingleCharEsc(self):
        """Returns a single unicode character parsed from a single char escape.

        Raises RegularExpressionError if the current character is not a
        backslash or the character after it is not a key of
        SingleCharEscapes."""
        if self.theChar == u"\\":
            self.NextChar()
            if self.theChar in self.SingleCharEscapes:
                result = self.SingleCharEscapes[self.theChar]
                self.NextChar()
                return result
        raise RegularExpressionError(
            "Expected single character escape at [%i]" % self.pos)

    def ParseCatEsc(self):
        """Returns a CharClass, parsing a category escape.

        The escape has the form \\p{...}; the class returned by
        ParseCharProp is passed back as is.  Raises
        RegularExpressionError for any other input."""
        if self.Match("\\p{"):
            # step over the three characters of the \p{ introducer
            self.SetPos(self.pos + 3)
            cClass = self.ParseCharProp()
            if self.theChar == '}':
                self.NextChar()
                return cClass
        raise RegularExpressionError("Expected \\p{...} at [%i]" % self.pos)

    def ParseComplEsc(self):
        """Returns a CharClass, parsing the complement of a category escape.

        The escape has the form \\P{...}; a new CharClass is built from
        the parsed property and negated before being returned.  Raises
        RegularExpressionError for any other input."""
        if self.Match("\\P{"):
            # step over the three characters of the \P{ introducer
            self.SetPos(self.pos + 3)
            cClass = CharClass(self.ParseCharProp())
            if self.theChar == '}':
                self.NextChar()
                cClass.Negate()
                return cClass
        raise RegularExpressionError("Expected \\P{...} at [%i]" % self.pos)

    def ParseCharProp(self):
        """Returns a CharClass, parsing an IsCategory or IsBlock.

        A category name is tried first; if that fails the position is
        restored and a block name is parsed instead."""
        savePos = self.pos
        try:
            cClass = self.ParseIsCategory()
        except RegularExpressionError:
            self.SetPos(savePos)
            cClass = self.ParseIsBlock()
        return cClass

    # The recognised category names; only the keys are consulted by
    # ParseIsCategory, the values are always True.
    Categories = {
        u"L": True,
        u"Lu": True,
        u"Ll": True,
        u"Lt": True,
        u"Lm": True,
        u"Lo": True,
        u"M": True,
        u"Mn": True,
        u"Mc": True,
        u"Me": True,
        u"N": True,
        u"Nd": True,
        u"Nl": True,
        u"No": True,
        u"P": True,
        u"Pc": True,
        u"Pd": True,
        u"Ps": True,
        u"Pe": True,
        u"Pi": True,
        u"Pf": True,
        u"Po": True,
        u"Z": True,
        u"Zs": True,
        u"Zl": True,
        u"Zp": True,
        u"S": True,
        u"Sm": True,
        u"Sc": True,
        u"Sk": True,
        u"So": True,
        u"C": True,
        u"Cc": True,
        u"Cf": True,
        u"Co": True,
        u"Cn": True
    }

    def ParseIsCategory(self):
        """Returns a CharClass corresponding to one of the character categories
        or raises an error.

        The name is one or two characters long; the two-character form is
        preferred when it is a key of Categories."""
        if self.theChar in self.Categories:
            cat = self.theChar
            self.NextChar()
            if self.theChar is not None and (cat + self.theChar) in self.Categories:
                cat = cat + self.theChar
                self.NextChar()
            return CharClass.UCDCategory(cat)
        else:
            raise RegularExpressionError(
                "Expected category name [%i]" % self.pos)

    # The characters that may appear in a block name: ASCII letters,
    # digits and the hyphen.
    IsBlockClass = CharClass(
        (u'a', u'z'),
        (u'A', u'Z'),
        (u'0', u'9'),
        u'-')

    def ParseIsBlock(self):
        """Returns a CharClass corresponding to one of the Unicode blocks.

        The run of characters accepted by IsBlockClass is consumed; it
        must start with "Is" and the remainder is looked up with
        CharClass.UCDBlock.  Raises RegularExpressionError if the "Is"
        prefix is missing or if the lookup fails with KeyError."""
        block = []
        while self.IsBlockClass.Test(self.theChar):
            block.append(self.theChar)
            self.NextChar()
        block = ''.join(block)
        if block.startswith("Is"):
            try:
                return CharClass.UCDBlock(block[2:])
            except KeyError:
                raise RegularExpressionError(
                    "Invalid IsBlock name: %s" % block[2:])
        else:
            raise RegularExpressionError("Expected IsBlock [%i]" % self.pos)

    # Maps the character following the backslash in a multi-character
    # escape to the module-level CharClass instance it stands for.
    MultiCharEscapes = {
        's': sClass,
        'S': SClass,
        'i': iClass,
        'I': IClass,
        'c': NameCharClass,
        'C': CClass,
        'd': dClass,
        'D': DClass,
        'w': wClass,
        'W': WClass
    }

    def ParseMultiCharEsc(self):
        """Returns a CharClass corresponding to one of the multichar escapes, if parsed.

        The instance returned is the one stored in MultiCharEscapes, not
        a copy.  Raises RegularExpressionError if the current character
        is not a backslash or the character after it is not a key of
        MultiCharEscapes."""
        if self.theChar == u"\\":
            self.NextChar()
            try:
                result = self.MultiCharEscapes[self.theChar]
                self.NextChar()
                return result
            except KeyError:
                # unknown escape
                raise RegularExpressionError(
                    "Unknown multichar escape at [%i], \\%s" % (self.pos, repr(self.theChar)))
        else:
            raise RegularExpressionError("Expected '\\' at [%i]" % self.pos)

    # The class used for '.': every character code from 0 to maxunicode
    # except 10 (line feed) and 13 (carriage return).
    DotClass = CharClass(
        (unichr(0), unichr(9)),
        (unichr(11), unichr(12)),
        (unichr(14), unichr(maxunicode)))

    def ParseWildcardEsc(self):
        """Returns a CharClass corresponding to the wildcard '.' character if parsed.

        The instance returned is the shared DotClass.  Raises
        RegularExpressionError if the current character is not '.'."""
        if self.theChar == u".":
            self.NextChar()
            return self.DotClass
        else:
            raise RegularExpressionError("Expected '.' at [%i]" % self.pos)
Esempio n. 14
0
    def Match(self, target):
        """A convenience function, returns 1 if the expression matches the
        whole of *target* and 0 otherwise."""
        found = self.p.match(target)
        # a match that stops short of the end of target does not count
        if found is not None and found.end(0) >= len(target):
            return 1
        return 0


class RegularExpressionError(Exception):
    """Signals that a regular expression could not be parsed."""


# Module-level character classes.  Each one is paired with a second class
# that is built from it and then negated; Negate() is called purely for
# its effect on the instance.  The names follow the letters of the
# multi-character escapes (presumably \s, \S, \i, \I, \C, \d, \D, \w and
# \W; confirm against the MultiCharEscapes table of the parser).

# tab, line feed, carriage return and space
sClass = CharClass(u'\x09', u'\x0A', u'\x0D', u' ')
SClass = CharClass(sClass)
SClass.Negate()
# LetterCharClass plus underscore and colon
iClass = CharClass(LetterCharClass, u'_', u':')
IClass = CharClass(iClass)
IClass.Negate()
# negation of NameCharClass (NameCharClass itself is defined elsewhere)
CClass = CharClass(NameCharClass)
CClass.Negate()
# Unicode category Nd
dClass = CharClass.UCDCategory('Nd')
DClass = CharClass(dClass)
DClass.Negate()
# here the upper-case class is built first, from the categories P, Z and
# C, and the lower-case class is derived from it by negation
WClass = CharClass(CharClass.UCDCategory(
    'P'), CharClass.UCDCategory('Z'), CharClass.UCDCategory('C'))
wClass = CharClass(WClass)
wClass.Negate()