def __unichr(self, code_point):
    """Return the text used to represent C{code_point} in a Python
    regular expression.

    @param code_point: The integer code point to render.

    @return: The character for C{code_point}, prefixed with a backslash
    when the code point is listed in the class's set of code points that
    need escaping.  Code point zero is rendered as the three characters
    C{x00} rather than as a raw NUL character."""
    char = six.unichr(code_point)
    if code_point == 0:
        # Spell NUL out textually.  NOTE(review): presumably zero is in the
        # escaped set so the backslash below completes the escape sequence;
        # confirm against the class-level table.
        char = six.u('x00')
    needs_escape = code_point in self.__XMLtoPythonREEscapedCodepoints
    return six.unichr(0x5c) + char if needs_escape else char
def __unichr(self, code_point):
    """Render a code point as text suitable for a Python regular
    expression.

    @param code_point: Integer code point to convert.

    @return: The unicode character for C{code_point}; code point zero
    becomes the literal text C{x00}.  If the code point is a member of
    the class's escaped-code-point collection, the result is preceded by
    a backslash."""
    backslash = six.unichr(0x5c)
    rendered = six.unichr(code_point)
    if code_point == 0:
        # A textual stand-in replaces the raw NUL character.
        # NOTE(review): this only forms a valid escape if zero is also in
        # the escaped collection checked below -- verify.
        rendered = six.u('x00')
    if code_point in self.__XMLtoPythonREEscapedCodepoints:
        return backslash + rendered
    return rendered
def asSingleCharacter(self):
    """Return the sole character of this set, if there is exactly one.

    The set qualifies when its internal code point list holds exactly two
    entries that differ by at most one.  NOTE(review): presumably the two
    entries are the start and exclusive end of a single range; confirm
    against the class's storage format.

    @return: The unicode string for the first stored code point, or
    C{None} if the set does not represent a single character."""
    boundaries = self.__codepoints
    if len(boundaries) == 2 and boundaries[1] - boundaries[0] <= 1:
        return six.unichr(boundaries[0])
    return None
def XMLToPython(pattern):
    """Translate an XML Schema regular expression into a Python one.

    The pattern is scanned left to right.  Wherever
    C{MaybeMatchCharacterClass} recognizes a character class, the class's
    C{asPattern()} text is emitted; every other character is copied
    through, except that C{^} and C{$} are backslash-escaped.  The result
    is wrapped in C{^(} ... C{)$}.

    @param pattern: A Unicode string defining a pattern consistent with
    U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}."""
    assert isinstance(pattern, six.text_type)
    backslash = six.unichr(0x5c)
    # Ordinary characters in XSD, but anchors in Python: must be escaped.
    anchors = (six.u('^'), six.u('$'))
    pieces = ['^(']
    index = 0
    end = len(pattern)
    while index < end:
        match = MaybeMatchCharacterClass(pattern, index)
        if match is not None:
            # The matcher reports the parsed set and the offset after it.
            char_class, index = match
            pieces.append(char_class.asPattern())
            continue
        literal = pattern[index]
        index += 1
        pieces.append(backslash + literal if literal in anchors else literal)
    pieces.append(')$')
    return ''.join(pieces)
def XMLToPython(pattern):
    """Convert an XML Schema pattern into the equivalent Python regular
    expression.

    Character classes found by C{MaybeMatchCharacterClass} are replaced
    by the text of their C{asPattern()} method.  All remaining characters
    are kept as they are, apart from C{^} and C{$}, which receive a
    leading backslash.  The converted body is enclosed in C{^(} and
    C{)$}.

    @param pattern: A Unicode string defining a pattern consistent with
    U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}."""
    assert isinstance(pattern, six.text_type)
    escape = six.unichr(0x5c)
    caret = six.u('^')
    dollar = six.u('$')
    converted = ['^(']
    offset = 0
    while offset < len(pattern):
        parsed = MaybeMatchCharacterClass(pattern, offset)
        if parsed is None:
            symbol = pattern[offset]
            if symbol == caret or symbol == dollar:
                # No special meaning in XSD, but start/end-of-string
                # anchors in Python, so neutralize them.
                symbol = escape + symbol
            converted.append(symbol)
            offset += 1
        else:
            # A character class was consumed; resume right after it.
            code_points, offset = parsed
            converted.append(code_points.asPattern())
    converted.append(')$')
    return ''.join(converted)
def _InitializeAllEsc():
    """Populate C{_AllEsc} from the escape tables in
    C{pyxb.utils.unicode}.

    The wildcard C{.} is registered first.  Then every key of the
    single-character, multi-character, category, complement and block
    escape tables is registered under a backslash followed by the key
    text, in that table order.  Doing the work inside a function keeps
    the loop variables out of the module namespace."""
    _AllEsc.update({six.u('.'): pyxb.utils.unicode.WildcardEsc})
    backslash = six.unichr(0x5c)
    table_names = (
        'SingleCharEsc',
        'MultiCharEsc',
        'catEsc',
        'complEsc',
        'IsBlockEsc',
    )
    for table_name in table_names:
        # Look each table up only when it is about to be processed.
        table = getattr(pyxb.utils.unicode, table_name)
        for key, code_points in six.iteritems(table):
            _AllEsc[backslash + six.text_type(key)] = code_points
def _InitializeAllEsc():
    """Fill C{_AllEsc} with every escape known to C{pyxb.utils.unicode}.

    The C{.} wildcard entry is stored first; afterwards the
    C{SingleCharEsc}, C{MultiCharEsc}, C{catEsc}, C{complEsc} and
    C{IsBlockEsc} tables are merged in that order, each key being
    prefixed with a backslash.  Wrapping this in a function avoids
    leaving temporary names in the module."""
    escape_prefix = six.unichr(0x5c)

    def register(table):
        # Copy one table into _AllEsc under backslash-prefixed keys.
        for name, value in six.iteritems(table):
            _AllEsc[escape_prefix + six.text_type(name)] = value

    _AllEsc.update({six.u('.'): pyxb.utils.unicode.WildcardEsc})
    register(pyxb.utils.unicode.SingleCharEsc)
    register(pyxb.utils.unicode.MultiCharEsc)
    register(pyxb.utils.unicode.catEsc)
    register(pyxb.utils.unicode.complEsc)
    register(pyxb.utils.unicode.IsBlockEsc)
def _MatchPosCharGroup(text, position):
    '''Parse a
    U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>}
    term.

    The text is first split into tokens (literal characters, code point
    sets from escapes, and a marker for each unescaped dash); the tokens
    are then combined into single code points and ranges.

    @param text: The text being parsed.

    @param position: Offset in C{text} at which the group starts.

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code
        points associated with the group;
      - C{fs} is a C{bool} that is C{True} if the group is followed by the
        C{-} of a
        U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>}
        and C{False} otherwise;
      - C{p} is the offset at which scanning stopped: the offset of the
        closing C{]}, or of the C{-} that introduces a subtraction.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    group_start = position

    # A unique marker for an unescaped dash.  It is neither text nor a
    # CodePointSet, so it can never be confused with an escaped dash.
    class _PlainDash:
        pass
    DASH = _PlainDash()

    backslash = six.unichr(0x5c)
    items = []
    followed_by_subtraction = False

    # Pass 1: tokenize up to the closing bracket or a subtraction.
    while True:
        if position >= len(text):
            raise RegularExpressionError(
                position,
                "Incomplete character class expression, missing closing ']'")
        current = text[position]
        if current == six.u(']'):
            break
        if current == six.u('['):
            # A nested bracket is legal only right after a plain dash,
            # where it begins a character class subtraction.
            if not items or items[-1] is not DASH:
                raise RegularExpressionError(
                    position, "'[' character not allowed in character class")
            followed_by_subtraction = True
            # The "-[" is not part of this group: drop the dash token and
            # step back so the caller sees the dash.
            items.pop()
            position -= 1
            break
        if current == backslash:
            escaped, position = _MatchCharClassEsc(text, position)
            as_char = escaped.asSingleCharacter()
            items.append(escaped if as_char is None else as_char)
            continue
        # A plain "-" becomes DASH; an escaped dash arrives above as text.
        items.append(DASH if current == six.u('-') else current)
        position += 1

    if not items:
        raise RegularExpressionError(position,
                                     "Empty character class not allowed")

    # A dash at either edge of the group can only be a literal.
    if items[0] is DASH:
        items[0] = six.u('-')
    if items[-1] is DASH:
        items[-1] = six.u('-')

    # Pass 2: join "x DASH y" triples into ranges, add everything else.
    code_points = pyxb.utils.unicode.CodePointSet()
    total = len(items)
    index = 0
    while index < total:
        first = items[index]
        is_range = index + 2 < total and items[index + 1] is DASH
        if not is_range:
            if first is DASH:
                raise RegularExpressionError(
                    group_start, 'Dash without an initial character')
            if isinstance(first, six.text_type):
                code_points.add(ord(first))
            else:
                code_points.extend(first)
            index += 1
            continue
        last = items[index + 2]
        if not (isinstance(first, six.text_type)
                and isinstance(last, six.text_type)):
            if first is DASH or last is DASH:
                raise RegularExpressionError(
                    group_start,
                    'Two dashes in a row is not allowed in the middle of a character class.'
                )
            raise RegularExpressionError(
                group_start,
                'Dashes must be surrounded by characters, not character class escapes. %r %r'
                % (first, last))
        if first > last:
            raise RegularExpressionError(
                group_start,
                'Character ranges must have the lowest character first')
        code_points.add((ord(first), ord(last)))
        index += 3
    return code_points, followed_by_subtraction, position
def _MatchPosCharGroup(text, position):
    '''Parse a
    U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>}
    term.

    Parsing runs in two phases: the input is tokenized until the end of
    the group, then dash-separated token pairs are merged into ranges.

    @param text: The text being parsed.

    @param position: Offset in C{text} where the group begins.

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code
        points associated with the group;
      - C{fs} is a C{bool} that is C{True} when a
        U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>}
        subtraction follows the group, and C{False} otherwise;
      - C{p} is the offset where tokenizing ended, i.e. the offset of the
        closing C{]} or of the C{-} that starts the subtraction.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''
    initial_position = position

    # Sentinel type for an unescaped dash; deliberately neither unicode
    # nor a CodePointSet.
    class DashClass:
        pass
    DASH = DashClass()

    literal_dash = six.u('-')
    parts = []
    subtraction_follows = False

    # Phase 1: tokenize.
    while True:
        if position >= len(text):
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        char = text[position]
        if char == six.u('['):
            # Only a subtraction ("-[") may open a bracket in here.
            if not parts or parts[-1] is not DASH:
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            subtraction_follows = True
            # Un-read the dash: it belongs to the subtraction, not to
            # this group.
            parts.pop()
            position -= 1
            break
        elif char == six.u(']'):
            break
        elif char == six.unichr(0x5c):
            # Backslash escape: keep single characters as text, anything
            # larger as a code point set.
            escape_cps, position = _MatchCharClassEsc(text, position)
            lone_char = escape_cps.asSingleCharacter()
            if lone_char is None:
                parts.append(escape_cps)
            else:
                parts.append(lone_char)
        elif char == literal_dash:
            # DASH marks a plain "-"; an escaped dash is stored as text.
            parts.append(DASH)
            position += 1
        else:
            parts.append(char)
            position += 1

    if not parts:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # Leading and trailing dashes are always literal.
    if parts[0] is DASH:
        parts[0] = literal_dash
    if parts[-1] is DASH:
        parts[-1] = literal_dash

    # Phase 2: assemble the code point set.
    result = pyxb.utils.unicode.CodePointSet()
    cursor = 0
    while cursor < len(parts):
        low = parts[cursor]
        remaining = len(parts) - cursor
        if remaining > 2 and parts[cursor + 1] is DASH:
            high = parts[cursor + 2]
            both_text = isinstance(low, six.text_type) and isinstance(high, six.text_type)
            if not both_text:
                if low is DASH or high is DASH:
                    raise RegularExpressionError(initial_position, 'Two dashes in a row is not allowed in the middle of a character class.')
                raise RegularExpressionError(initial_position, 'Dashes must be surrounded by characters, not character class escapes. %r %r' % (low, high))
            if low > high:
                raise RegularExpressionError(initial_position, 'Character ranges must have the lowest character first')
            result.add((ord(low), ord(high)))
            cursor += 3
        else:
            if low is DASH:
                raise RegularExpressionError(initial_position, 'Dash without an initial character')
            if isinstance(low, six.text_type):
                result.add(ord(low))
            else:
                result.extend(low)
            cursor += 1
    return result, subtraction_follows, position
def asSingleCharacter(self):
    """If this set represents a single character, return it as its
    unicode string value.  Otherwise return C{None}.

    A set counts as a single character when its internal code point list
    has exactly two entries and the second exceeds the first by no more
    than one.  NOTE(review): this looks like a start / exclusive-end
    range encoding; verify against the rest of the class."""
    points = self.__codepoints
    if len(points) != 2:
        return None
    first, second = points[0], points[1]
    if second - first > 1:
        return None
    return six.unichr(first)