def ParseComplEsc(self):
    """Returns a CharClass, parsing the complement of a category escape.

    The escape must open with a backslash, an upper-case P and an opening
    brace and must be closed by a closing brace; otherwise
    RegularExpressionError is raised, reporting the position at which
    parsing stopped."""
    if not self.Match("\\P{"):
        raise RegularExpressionError(
            "Expected \\P{...} at [%i]" % self.pos)
    # step over the three characters of the opening sequence
    self.SetPos(self.pos + 3)
    # wrap the property in a new CharClass before negating it
    complement = CharClass(self.ParseCharProp())
    if self.theChar != '}':
        raise RegularExpressionError(
            "Expected \\P{...} at [%i]" % self.pos)
    self.NextChar()
    complement.Negate()
    return complement
def ParseCharRange(self):
    """Returns a CharClass representing this range.

    A start-end range is tried first.  If that fails the parser is
    rewound and a single character (a hyphen is acceptable here) is taken
    instead; if the current character is not acceptable either, the error
    from the failed range is re-raised."""
    start = self.pos
    try:
        return self.ParseSERange()
    except RegularExpressionError:
        self.SetPos(start)
        if not self.IsXmlCharIncDash():
            raise
        single = CharClass(self.theChar)
        self.NextChar()
        return single
def ParseSERange(self):
    """Returns a CharClass representing a start-end range.

    Parses a character (or single character escape), a hyphen and a
    second character (or escape).  Raises RegularExpressionError if the
    hyphen is missing or if the start character has a higher code point
    than the end character."""
    first = self.ParseCharOrEsc()
    if self.theChar != u"-":
        raise RegularExpressionError(
            "Expected '-' in seRange [%i]" % self.pos)
    self.NextChar()
    last = self.ParseCharOrEsc()
    if ord(first) > ord(last):
        raise RegularExpressionError(
            "Empty SERange: %s-%s" % (repr(first), repr(last)))
    return CharClass((first, last))
def ParseCharClassEsc(self):
    """Returns a CharClass instance representing one of the escape
    sequences.

    Category escapes and their complements are tried first, then
    multi-character escapes and finally single character escapes.  Raises
    RegularExpressionError if the current character is not a backslash or
    if no escape can be parsed."""
    if self.Match(u"\\p"):
        return self.ParseCatEsc()
    if self.Match(u"\\P"):
        return self.ParseComplEsc()
    if self.theChar != u"\\":
        raise RegularExpressionError(
            "Expected charClassEsc at [%i]" % self.pos)
    mark = self.pos
    try:
        return self.ParseMultiCharEsc()
    except RegularExpressionError:
        # not a multi-character escape: rewind and try a single
        # character escape instead
        self.SetPos(mark)
        return CharClass(self.ParseSingleCharEsc())
def ParsePosCharGroup(self):
    """Returns a CharClass representing a positive range

    Items (character ranges or character class escapes) are accumulated
    until neither can be parsed.  At least one item is required, otherwise
    RegularExpressionError is raised.  Parsing stops, without consuming
    the hyphen, when a hyphen followed by an opening square bracket
    introduces a subtraction."""
    group = CharClass()
    count = 0
    while True:
        mark = self.pos
        if count and self.theChar == u"-":
            # This had better be the last hyphen in the posCharGroup
            if self.Match(u"-["):
                # a subtraction, handled by the caller
                break
            if self.Match(u"-]") or self.Match(u"--["):
                # a literal hyphen closing the group
                group.AddChar(u"-")
                self.NextChar()
                break
            # this is all wrong
            raise RegularExpressionError(
                "hyphen must be first or last character in posCharGroup [%i]"
                % self.pos)
        try:
            group.AddClass(self.ParseCharRange())
            count += 1
            continue
        except RegularExpressionError:
            # not a range: rewind and try an escape
            self.SetPos(mark)
        try:
            group.AddClass(self.ParseCharClassEsc())
            count += 1
            continue
        except RegularExpressionError:
            if not count:
                # We expected either a charRange or a charClassEsc
                raise RegularExpressionError(
                    "Expected charRange or charClassEsc at [%i]" % self.pos)
            self.SetPos(mark)
            break
    return group
class RegularExpressionParser(BasicParser):
    """A custom parser for XML schema regular expressions.

    The parser is initialised from a source string, the string to be
    parsed.

    The Parse* methods that return strings build up an equivalent python
    regular expression; those that deal with character classes return
    CharClass instances.  All of them raise RegularExpressionError when
    the input does not match the production being parsed."""

    def _REEscape(self, c=None):
        """Returns *c* escaped, if necessary, for use in a python regular
        expression.

        If *c* is None the current character (self.theChar) is used."""
        if c is None:
            # assignment, not comparison: fall back to the current char
            c = self.theChar
        if c in u".^$*+?{}\\[]|()":
            return "\\" + c
        else:
            return c

    def ParseRegularExpression(self):
        """Returns self.src as is; no parsing is done by this method."""
        return self.src

    def ParseRegExp(self):
        """Returns a unicode string representing the regular expression."""
        result = []
        while True:
            # expression ends at the end of the string or at a closing
            # bracket
            result.append(self.ParseBranch())
            if self.theChar == u"|":
                self.NextChar()
                result.append(u"|")
                continue
            else:
                break
        return ''.join(result)

    def ParseBranch(self):
        """Returns a unicode string representing this piece as a python
        regular expression."""
        result = []
        while self.IsChar() or self.MatchOne(u".\\[("):
            result.append(self.ParsePiece())
        return ''.join(result)

    def ParsePiece(self):
        """Returns a unicode string representing this piece (an atom with
        an optional quantifier) as a python regular expression.

        Quantities are written using the shortest equivalent form, for
        example, {0,1} becomes ? and {1,} becomes +.  A piece quantified
        with {0,0} yields an empty string."""
        result = self.ParseAtom()
        if self.MatchOne("?*+{"):
            n, m = self.ParseQuantifier()
            if n == 0:
                if m is None:
                    return result + u"*"
                elif m == 0:
                    return u""
                elif m == 1:
                    return result + u"?"
                else:
                    return "%s{,%i}" % (result, m)
            elif n == 1:
                if m is None:
                    return result + u"+"
                elif m == 1:
                    return result
                else:
                    return "%s{1,%i}" % (result, m)
            elif m is None:
                return "%s{%i,}" % (result, n)
            elif n == m:
                return "%s{%i}" % (result, n)
            else:
                return "%s{%i,%i}" % (result, n, m)
        else:
            return result

    def ParseQuantifier(self):
        """Returns a tuple of n,m.

        Symbolic values are expanded to the appropriate pair.  The second
        value may be None indicating unbounded."""
        if self.theChar == u"?":
            self.NextChar()
            return 0, 1
        elif self.theChar == u"*":
            self.NextChar()
            return 0, None
        elif self.theChar == u"+":
            self.NextChar()
            return 1, None
        elif self.theChar == u"{":
            self.NextChar()
            result = self.ParseQuantity()
            if self.theChar == u"}":
                self.NextChar()
                return result
            else:
                raise RegularExpressionError("Expected } at [%i]" % self.pos)
        else:
            raise RegularExpressionError(
                "Expected quantifier at [%i]" % self.pos)

    def ParseQuantity(self):
        """Returns a tuple of n,m even if an exact quantity is given.

        In other words, the exact quantity 'n' returns n,n.  The second
        value may be None indicating unbounded.  Raises
        RegularExpressionError if an upper bound is given that is smaller
        than the lower bound."""
        n = self.ParseQuantExact()
        m = None
        if self.theChar == u",":
            self.NextChar()
            # a comma with no digits after it leaves m as None (unbounded)
            if self.MatchOne(u"0123456789"):
                m = self.ParseQuantExact()
                if n > m:
                    raise RegularExpressionError(
                        "Illegal quantity: {%i,%i}" % (n, m))
        else:
            m = n
        return n, m

    def ParseQuantExact(self):
        """Returns an integer.

        Raises RegularExpressionError if there is no digit at the current
        position."""
        result = 0
        nDigits = 0
        while self.MatchOne(u"0123456789"):
            result = result * 10 + ord(self.theChar) - 0x30
            self.NextChar()
            nDigits += 1
        if nDigits == 0:
            raise RegularExpressionError("Expected digit at [%i]" % self.pos)
        return result

    def ParseAtom(self):
        """Returns a unicode string representing this atom as a python
        regular expression."""
        if self.IsChar():
            result = self._REEscape(self.theChar)
            self.NextChar()
        elif self.theChar == "(":
            # a regular expression
            self.NextChar()
            result = "(%s)" % self.ParseRegExp()
            if self.theChar != ")":
                raise RegularExpressionError("Expected ) at [%i]" % self.pos)
            self.NextChar()
        else:
            cClass = self.ParseCharClass()
            result = unicode(cClass)
        return result

    def IsChar(self, c=None):
        """The definition of this function is designed to be conservative
        with respect to the specification, which is clearly in error
        around production [10] as the prose and the BNF do not match.

        It appears that | was intended to be excluded in the prose but has
        been omitted, the reverse being true for the curly-brackets.

        If *c* is None the current character is tested; the result is
        False if there is no current character."""
        if c is None:
            c = self.theChar
        if c is None or c in ".\\?*+{}()[]|":
            return False
        else:
            return True

    def ParseCharClass(self):
        """Returns a CharClass instance representing this class."""
        if self.theChar == u"[":
            return self.ParseCharClassExpr()
        elif self.theChar == u"\\":
            return self.ParseCharClassEsc()
        elif self.theChar == u".":
            return self.ParseWildcardEsc()
        else:
            raise RegularExpressionError(
                "Expected [, \\ or . at [%i]" % self.pos)

    def ParseCharClassExpr(self):
        """Returns a CharClass instance representing this class
        expression."""
        if self.theChar == "[":
            self.NextChar()
            cClass = self.ParseCharGroup()
            if self.theChar == "]":
                self.NextChar()
                return cClass
            else:
                raise RegularExpressionError("Expected ] at [%i]" % self.pos)
        else:
            raise RegularExpressionError("Expected [ at [%i]" % self.pos)

    def ParseCharGroup(self):
        """Returns a CharClass representing this group.

        This method also handles the case of a class subtraction directly
        to reduce the need for look-ahead.  If you specifically want to
        parse a subtraction you can do this with
        :py:meth:`ParseCharClassSub`."""
        if self.theChar == u"^":
            cClass = self.ParseNegCharGroup()
        else:
            cClass = self.ParsePosCharGroup()
        if self.theChar == u"-":
            self.NextChar()
            subClass = self.ParseCharClassExpr()
            cClass.SubtractClass(subClass)
        return cClass

    def ParsePosCharGroup(self):
        """Returns a CharClass representing a positive range"""
        cClass = CharClass()
        nRanges = 0
        while True:
            savePos = self.pos
            if self.theChar == u"-" and nRanges:
                # This had better be the last hyphen in the posCharGroup
                if self.Match(u"-["):
                    # a subtraction
                    break
                elif self.Match(u"-]") or self.Match(u"--["):
                    cClass.AddChar(u"-")
                    self.NextChar()
                    break
                else:
                    # this is all wrong
                    raise RegularExpressionError(
                        "hyphen must be first or last character in posCharGroup [%i]" % self.pos)
            try:
                cClass.AddClass(self.ParseCharRange())
                nRanges += 1
                continue
            except RegularExpressionError:
                self.SetPos(savePos)
                pass
            try:
                cClass.AddClass(self.ParseCharClassEsc())
                nRanges += 1
                continue
            except RegularExpressionError:
                if nRanges:
                    self.SetPos(savePos)
                    break
                else:
                    # We expected either a charRange or a charClassEsc
                    raise RegularExpressionError(
                        "Expected charRange or charClassEsc at [%i]" % self.pos)
        return cClass

    def ParseNegCharGroup(self):
        """Returns a CharClass representing this range."""
        if self.theChar == u"^":
            # we have a negative range
            self.NextChar()
            cClass = self.ParsePosCharGroup()
            cClass.Negate()
            return cClass
        else:
            raise RegularExpressionError(
                "Expected negCharGroup at [%i]" % self.pos)

    def ParseCharClassSub(self):
        """Returns a CharClass representing this range - this method is
        not normally used by the parser and is present for completeness.

        See :py:meth:`ParseCharGroup`."""
        if self.theChar == u"^":
            cClass = self.ParseNegCharGroup()
        else:
            cClass = self.ParsePosCharGroup()
        if self.theChar == u"-":
            self.NextChar()
            subClass = self.ParseCharClassExpr()
            cClass.SubtractClass(subClass)
            return cClass
        else:
            raise RegularExpressionError("Expected - at [%i]" % self.pos)

    def ParseCharRange(self):
        """Returns a CharClass representing this range."""
        savePos = self.pos
        try:
            cClass = self.ParseSERange()
        except RegularExpressionError:
            # not a start-end range, fall back to a single character
            self.SetPos(savePos)
            if self.IsXmlCharIncDash():
                cClass = CharClass(self.theChar)
                self.NextChar()
            else:
                raise
        return cClass

    def ParseSERange(self):
        """Returns a CharClass representing this range."""
        s = self.ParseCharOrEsc()
        if self.theChar == u"-":
            self.NextChar()
        else:
            raise RegularExpressionError(
                "Expected '-' in seRange [%i]" % self.pos)
        e = self.ParseCharOrEsc()
        if ord(s) > ord(e):
            raise RegularExpressionError(
                "Empty SERange: %s-%s" % (repr(s), repr(e)))
        return CharClass((s, e))

    def ParseCharOrEsc(self):
        """Returns a single unicode character."""
        if self.IsXmlChar():
            result = self.theChar
            self.NextChar()
            return result
        else:
            return self.ParseSingleCharEsc()

    def IsXmlChar(self, c=None):
        """Returns True if *c* (by default, the current character) is not
        None and is not a backslash, hyphen or square bracket."""
        if c is None:
            c = self.theChar
        return c is not None and c not in "\\-[]"

    def IsXmlCharIncDash(self, c=None):
        """Returns True if *c* (by default, the current character) is not
        None and is not a backslash or square bracket."""
        if c is None:
            c = self.theChar
        return c is not None and c not in "\\[]"

    def ParseCharClassEsc(self):
        """Returns a CharClass instance representing one of the escape
        sequences."""
        if self.Match(u"\\p"):
            cClass = self.ParseCatEsc()
        elif self.Match(u"\\P"):
            cClass = self.ParseComplEsc()
        elif self.theChar == u"\\":
            try:
                savePos = self.pos
                cClass = self.ParseMultiCharEsc()
            except RegularExpressionError:
                self.SetPos(savePos)
                cClass = CharClass(self.ParseSingleCharEsc())
        else:
            raise RegularExpressionError(
                "Expected charClassEsc at [%i]" % self.pos)
        return cClass

    # maps the character following the backslash in a single character
    # escape onto the character it represents
    SingleCharEscapes = {
        u'n': unichr(0x0A),
        u'r': unichr(0x0D),
        u't': unichr(0x09),
        u'\\': u'\\',
        u'|': u'|',
        u'.': u'.',
        u'-': u'-',
        u'^': u'^',
        u'?': u'?',
        u'*': u'*',
        u'+': u'+',
        u'{': u'{',
        u'}': u'}',
        u'(': u'(',
        u')': u')',
        u'[': u'[',
        u']': u']'
    }

    def ParseSingleCharEsc(self):
        """Returns a single unicode character parsed from a single char
        escape."""
        if self.theChar == u"\\":
            self.NextChar()
            if self.theChar in self.SingleCharEscapes:
                result = self.SingleCharEscapes[self.theChar]
                self.NextChar()
                return result
        raise RegularExpressionError(
            "Expected single character escape at [%i]" % self.pos)

    def ParseCatEsc(self):
        """Returns a CharClass, parsing a category escape."""
        if self.Match("\\p{"):
            # skip the three characters of the opening sequence
            self.SetPos(self.pos + 3)
            cClass = self.ParseCharProp()
            if self.theChar == '}':
                self.NextChar()
                return cClass
        raise RegularExpressionError("Expected \\p{...} at [%i]" % self.pos)

    def ParseComplEsc(self):
        """Returns a CharClass, parsing the complement of a category
        escape."""
        if self.Match("\\P{"):
            # skip the three characters of the opening sequence
            self.SetPos(self.pos + 3)
            # wrap the property in a new CharClass before negating it
            cClass = CharClass(self.ParseCharProp())
            if self.theChar == '}':
                self.NextChar()
                cClass.Negate()
                return cClass
        raise RegularExpressionError("Expected \\P{...} at [%i]" % self.pos)

    def ParseCharProp(self):
        """Returns a CharClass, parsing an IsCategory or IsBlock."""
        savePos = self.pos
        try:
            cClass = self.ParseIsCategory()
        except RegularExpressionError:
            self.SetPos(savePos)
            cClass = self.ParseIsBlock()
        return cClass

    # the recognised category names; only the keys are significant
    Categories = {
        u"L": True, u"Lu": True, u"Ll": True, u"Lt": True, u"Lm": True,
        u"Lo": True,
        u"M": True, u"Mn": True, u"Mc": True, u"Me": True,
        u"N": True, u"Nd": True, u"Nl": True, u"No": True,
        u"P": True, u"Pc": True, u"Pd": True, u"Ps": True, u"Pe": True,
        u"Pi": True, u"Pf": True, u"Po": True,
        u"Z": True, u"Zs": True, u"Zl": True, u"Zp": True,
        u"S": True, u"Sm": True, u"Sc": True, u"Sk": True, u"So": True,
        u"C": True, u"Cc": True, u"Cf": True, u"Co": True, u"Cn": True
    }

    def ParseIsCategory(self):
        """Returns a CharClass corresponding to one of the character
        categories or raises an error."""
        if self.theChar in self.Categories:
            cat = self.theChar
            self.NextChar()
            # take a second letter only if it completes a known category
            if (self.theChar is not None and
                    (cat + self.theChar) in self.Categories):
                cat = cat + self.theChar
                self.NextChar()
            return CharClass.UCDCategory(cat)
        else:
            raise RegularExpressionError(
                "Expected category name [%i]" % self.pos)

    # the characters that are collected as part of a block name
    IsBlockClass = CharClass(
        (u'a', u'z'), (u'A', u'Z'), (u'0', u'9'), u'-')

    def ParseIsBlock(self):
        """Returns a CharClass corresponding to one of the Unicode
        blocks."""
        block = []
        while self.IsBlockClass.Test(self.theChar):
            block.append(self.theChar)
            self.NextChar()
        block = ''.join(block)
        if block.startswith("Is"):
            try:
                return CharClass.UCDBlock(block[2:])
            except KeyError:
                raise RegularExpressionError(
                    "Invalid IsBlock name: %s" % block[2:])
        else:
            raise RegularExpressionError("Expected IsBlock [%i]" % self.pos)

    # maps the character following the backslash in a multi-character
    # escape onto the CharClass it represents
    MultiCharEscapes = {
        's': sClass,
        'S': SClass,
        'i': iClass,
        'I': IClass,
        'c': NameCharClass,
        'C': CClass,
        'd': dClass,
        'D': DClass,
        'w': wClass,
        'W': WClass
    }

    def ParseMultiCharEsc(self):
        """Returns a CharClass corresponding to one of the multichar
        escapes, if parsed.

        The instance returned is the one held in MultiCharEscapes, it is
        not a copy."""
        if self.theChar == u"\\":
            self.NextChar()
            try:
                result = self.MultiCharEscapes[self.theChar]
                self.NextChar()
                return result
            except KeyError:
                # unknown escape
                raise RegularExpressionError(
                    "Unknown multichar escape at [%i], \\%s" %
                    (self.pos, repr(self.theChar)))
        else:
            raise RegularExpressionError("Expected '\\' at [%i]" % self.pos)

    # everything except code points 10 (line feed) and 13 (carriage
    # return)
    DotClass = CharClass(
        (unichr(0), unichr(9)),
        (unichr(11), unichr(12)),
        (unichr(14), unichr(maxunicode)))

    def ParseWildcardEsc(self):
        """Returns a CharClass corresponding to the wildcard '.' character
        if parsed.

        The instance returned is the shared DotClass, it is not a copy."""
        if self.theChar == u".":
            self.NextChar()
            return self.DotClass
        else:
            raise RegularExpressionError("Expected '.' at [%i]" % self.pos)
def Match(self, target):
    """A convenience function, returns 1 if the expression matches the
    whole of *target* and 0 otherwise.

    NOTE(review): this reads self.p, so it is presumably a method of a
    class whose header is outside this view - confirm its indentation
    against the enclosing class."""
    found = self.p.match(target)
    # a match that stops short of the end of target does not count
    if found is not None and found.end(0) >= len(target):
        return 1
    return 0


class RegularExpressionError(Exception):
    """Raised when a regular expression cannot be parsed."""


# white space: tab, line feed, carriage return and space
sClass = CharClass(u'\x09', u'\x0A', u'\x0D', u' ')
# built from sClass and then negated
SClass = CharClass(sClass)
SClass.Negate()

# letters plus underscore and colon
iClass = CharClass(LetterCharClass, u'_', u':')
# built from iClass and then negated
IClass = CharClass(iClass)
IClass.Negate()

# built from NameCharClass and then negated
CClass = CharClass(NameCharClass)
CClass.Negate()

# the Nd category
dClass = CharClass.UCDCategory('Nd')
# built from dClass and then negated
DClass = CharClass(dClass)
DClass.Negate()

# the P, Z and C categories combined
WClass = CharClass(
    CharClass.UCDCategory('P'),
    CharClass.UCDCategory('Z'),
    CharClass.UCDCategory('C'))
# built from WClass and then negated
wClass = CharClass(WClass)
wClass.Negate()