Example #1
0
 def __unichr(self, code_point):
     """Return the string form of C{code_point}, escaped when necessary.

     Code point zero is rendered as the three characters C{x00}
     instead of a raw NUL character.  If C{code_point} is a member of
     C{__XMLtoPythonREEscapedCodepoints}, a backslash is placed in
     front of the rendered text.

     @param code_point: The integer code point to render.

     @return: A unicode string for the code point, with a leading
     backslash when the code point is in the escaped set."""
     rendered = six.unichr(code_point)
     if code_point == 0:
         # NOTE(review): presumably zero is also in the escaped set, so
         # the prefix added below completes a backslash-x00 escape --
         # confirm against the definition of the escaped set.
         rendered = six.u('x00')
     if code_point not in self.__XMLtoPythonREEscapedCodepoints:
         return rendered
     return six.unichr(0x5c) + rendered
Example #2
0
 def __unichr(self, code_point):
     """Render C{code_point} as text, adding a backslash if required.

     @param code_point: The integer code point to convert.

     @return: The unicode character for C{code_point} (or the literal
     text C{x00} when the code point is zero), prefixed by a backslash
     whenever the code point appears in
     C{__XMLtoPythonREEscapedCodepoints}."""
     char = six.unichr(code_point)
     if code_point == 0:
         # A raw NUL is never emitted; the textual form is used instead.
         char = six.u('x00')
     must_escape = code_point in self.__XMLtoPythonREEscapedCodepoints
     return (six.unichr(0x5c) + char) if must_escape else char
Example #3
0
 def asSingleCharacter(self):
     """Return the only character in this set, or C{None}.

     @return: The character as a unicode string when the set
     represents exactly one character; C{None} otherwise."""
     points = self.__codepoints
     # NOTE(review): presumably the internal list stores range
     # boundaries, so a lone character shows up as exactly two entries
     # at most one apart -- confirm against the class definition.
     if len(points) != 2:
         return None
     if points[1] - points[0] > 1:
         return None
     return six.unichr(points[0])
Example #4
0
def XMLToPython(pattern):
    """Translate an XML Schema regular expression into a Python one.

    The translated expression is wrapped as C{^(...)$}, so it only
    matches when the entire input string matches.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}."""
    assert isinstance(pattern, six.text_type)
    backslash = six.unichr(0x5c)
    # '^' and '$' are ordinary characters in XSD patterns but anchors in
    # Python, so they must be escaped when copied through.
    python_only_anchors = (six.u('^'), six.u('$'))
    pieces = ['^(']
    index = 0
    end = len(pattern)
    while index < end:
        matched = MaybeMatchCharacterClass(pattern, index)
        if matched is not None:
            # A character class was recognised: emit its Python form and
            # resume at the offset the matcher reported.
            char_class, index = matched
            pieces.append(char_class.asPattern())
            continue
        literal = pattern[index]
        index += 1
        if literal in python_only_anchors:
            literal = backslash + literal
        pieces.append(literal)
    pieces.append(')$')
    return ''.join(pieces)
Example #5
0
def XMLToPython(pattern):
    """Rewrite an XML Schema pattern as an equivalent Python pattern.

    Character classes are translated through
    C{MaybeMatchCharacterClass}; everything else is copied through,
    except that C{^} and C{$} are backslash-escaped.  The output is
    enclosed in C{^(} and C{)$}.

    @param pattern: A Unicode string defining a pattern consistent
    with U{XML regular
    expressions<http://www.w3.org/TR/xmlschema-2/index.html#regexs>}.

    @return: A Unicode string specifying a Python regular expression
    that matches the same language as C{pattern}."""
    assert isinstance(pattern, six.text_type)
    escape = six.unichr(0x5c)
    parts = ['^(']
    offset = 0
    while offset < len(pattern):
        class_match = MaybeMatchCharacterClass(pattern, offset)
        if class_match is not None:
            code_points, offset = class_match
            parts.append(code_points.asPattern())
        else:
            current = pattern[offset]
            offset += 1
            # These two characters have no special meaning in XSD, but
            # they anchor a match in Python, so neutralise them.
            if current == six.u('^') or current == six.u('$'):
                current = escape + current
            parts.append(current)
    parts.append(')$')
    return ''.join(parts)
Example #6
0
def _InitializeAllEsc():
    """Populate C{_AllEsc} with every recognised escape.

    The wildcard C{.} is registered as-is; every key of the escape
    tables in C{pyxb.utils.unicode} is registered with a leading
    backslash.  The work is done inside a function so that the loop
    variables do not become module-level names."""
    escapes = pyxb.utils.unicode
    _AllEsc.update({six.u('.'): escapes.WildcardEsc})
    backslash = six.unichr(0x5c)
    # Later tables overwrite earlier ones on a key collision, so the
    # order here matters.
    tables = (
        escapes.SingleCharEsc,
        escapes.MultiCharEsc,
        escapes.catEsc,
        escapes.complEsc,
        escapes.IsBlockEsc,
    )
    for table in tables:
        for name, code_points in six.iteritems(table):
            _AllEsc[backslash + six.text_type(name)] = code_points
Example #7
0
def _InitializeAllEsc():
    """Fill C{_AllEsc} from the escape tables in C{pyxb.utils.unicode}.

    Runs as a function so that no temporary loop variables are left
    behind in the module namespace."""
    _AllEsc.update({six.u('.'): pyxb.utils.unicode.WildcardEsc})
    prefix = six.unichr(0x5c)
    # Tables are looked up one at a time, in this order; each key is
    # stored with a leading backslash.
    table_names = (
        'SingleCharEsc',
        'MultiCharEsc',
        'catEsc',
        'complEsc',
        'IsBlockEsc',
    )
    for table_name in table_names:
        table = getattr(pyxb.utils.unicode, table_name)
        for key, value in six.iteritems(table):
            _AllEsc[prefix + six.text_type(key)] = value
Example #8
0
def _MatchPosCharGroup(text, position):
    '''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    Scanning starts at C{position} and continues until an unescaped
    C{]} is reached, or until the C{-[} that opens a
    U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>}
    is reached.

    @param text: The pattern being parsed.
    @param position: Offset in C{text} at which the group begins.

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is C{True} if scanning stopped at the C{-} that introduces a character class subtraction, and C{False} if it stopped at C{]};
      - C{p} is the offset in C{text} at which scanning stopped (the C{]}, or the C{-} of a subtraction).

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''

    group_start = position
    text_length = len(text)
    backslash = six.unichr(0x5c)

    # Unique marker for an unescaped "-".  It is neither a text string
    # nor a CodePointSet, so it cannot be confused with a real token.
    DASH = object()

    # Pass 1: tokenize.  Each token is a single character, a
    # CodePointSet (from a multi-character escape), or DASH.
    tokens = []
    has_following_subtraction = False
    while True:
        if position >= text_length:
            raise RegularExpressionError(
                position,
                "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == six.u(']'):
            # End of the group; the bracket itself is not consumed.
            break
        if ch == six.u('['):
            # A nested bracket is legal only as the "-[" of a subtraction.
            if not (tokens and tokens[-1] is DASH):
                raise RegularExpressionError(
                    position, "'[' character not allowed in character class")
            has_following_subtraction = True
            # The "-[" belongs to the subtraction, not to this group:
            # give the dash back.
            tokens.pop()
            position -= 1
            break
        if ch == backslash:
            escaped, position = _MatchCharClassEsc(text, position)
            as_char = escaped.asSingleCharacter()
            tokens.append(escaped if as_char is None else as_char)
            continue
        # A plain "-" becomes DASH so that it can be told apart from an
        # escaped "\-", which arrives above as the character u"-".
        tokens.append(DASH if ch == six.u('-') else ch)
        position += 1

    if not tokens:
        raise RegularExpressionError(position,
                                     "Empty character class not allowed")

    # A dash in first or last place can only be a literal dash.
    if tokens[0] is DASH:
        tokens[0] = six.u('-')
    if tokens[-1] is DASH:
        tokens[-1] = six.u('-')

    # Pass 2: fold "a-b" triples into ranges and collect everything else.
    result_cps = pyxb.utils.unicode.CodePointSet()
    token_count = len(tokens)
    index = 0
    while index < token_count:
        low = tokens[index]
        is_range = index + 2 < token_count and tokens[index + 1] is DASH
        if not is_range:
            if low is DASH:
                raise RegularExpressionError(
                    group_start, 'Dash without an initial character')
            if isinstance(low, six.text_type):
                result_cps.add(ord(low))
            else:
                result_cps.extend(low)
            index += 1
            continue

        high = tokens[index + 2]
        if not (isinstance(low, six.text_type)
                and isinstance(high, six.text_type)):
            if low is DASH or high is DASH:
                raise RegularExpressionError(
                    group_start,
                    'Two dashes in a row is not allowed in the middle of a character class.'
                )
            raise RegularExpressionError(
                group_start,
                'Dashes must be surrounded by characters, not character class escapes. %r %r'
                % (low, high))
        if low > high:
            raise RegularExpressionError(
                group_start,
                'Character ranges must have the lowest character first')
        result_cps.add((ord(low), ord(high)))
        index += 3

    return result_cps, has_following_subtraction, position
Example #9
0
def _MatchPosCharGroup(text, position):
    '''Parse a U{posCharGroup<http://www.w3.org/TR/xmlschema-2/#nt-posCharGroup>} term.

    The group is first split into tokens, and the tokens are then
    combined into single code points, code point ranges and code point
    sets.

    @param text: The pattern being parsed.
    @param position: Offset in C{text} of the first character of the group.

    @return: A tuple C{(cps, fs, p)} where:
      - C{cps} is a L{pyxb.utils.unicode.CodePointSet} containing the code points associated with the group;
      - C{fs} is a C{bool} that is C{True} if parsing stopped at the C{-} in a U{charClassSub<http://www.w3.org/TR/xmlschema-2/#nt-charClassSub>} and C{False} if the group is not part of a charClassSub;
      - C{p} is the text offset where parsing stopped: the closing C{]}, or the C{-} of a charClassSub.

    @raise RegularExpressionError: if the expression is syntactically
    invalid.
    '''

    origin = position

    class _DashMarker:
        """Type of the marker object standing for an unescaped dash."""

    # DASH only needs to be unique; it must not be a text string or a
    # CodePointSet.
    DASH = _DashMarker()

    open_bracket = six.u('[')
    close_bracket = six.u(']')
    literal_dash = six.u('-')
    backslash = six.unichr(0x5c)

    # Tokenize first; ranges are stitched together afterwards.
    tokens = []
    has_following_subtraction = False
    while True:
        if position >= len(text):
            raise RegularExpressionError(position, "Incomplete character class expression, missing closing ']'")
        ch = text[position]
        if ch == backslash:
            cps, position = _MatchCharClassEsc(text, position)
            single_char = cps.asSingleCharacter()
            if single_char is None:
                tokens.append(cps)
            else:
                tokens.append(single_char)
        elif ch == close_bracket:
            # Normal end of the group.
            break
        elif ch == open_bracket:
            # Acceptable only right after a plain dash, i.e. as the
            # start of a subtraction.
            preceded_by_dash = bool(tokens) and tokens[-1] is DASH
            if not preceded_by_dash:
                raise RegularExpressionError(position, "'[' character not allowed in character class")
            # The "-[" is not part of the posCharGroup, so un-read the
            # dash before stopping.
            tokens.pop()
            position = position - 1
            has_following_subtraction = True
            break
        else:
            # Plain "-" is recorded as DASH; an escaped "\-" was already
            # handled above and recorded as the character u"-".
            if ch == literal_dash:
                tokens.append(DASH)
            else:
                tokens.append(ch)
            position = position + 1

    if not tokens:
        raise RegularExpressionError(position, "Empty character class not allowed")

    # At either edge of the group a dash cannot form a range, so it is
    # a literal.
    for edge in (0, -1):
        if tokens[edge] is DASH:
            tokens[edge] = literal_dash

    result_cps = pyxb.utils.unicode.CodePointSet()
    total = len(tokens)
    cursor = 0
    while cursor < total:
        first = tokens[cursor]
        if cursor + 2 < total and tokens[cursor + 1] is DASH:
            last = tokens[cursor + 2]
            both_characters = isinstance(first, six.text_type) and isinstance(last, six.text_type)
            if not both_characters:
                if first is DASH or last is DASH:
                    message = 'Two dashes in a row is not allowed in the middle of a character class.'
                else:
                    message = 'Dashes must be surrounded by characters, not character class escapes. %r %r' % (first, last)
                raise RegularExpressionError(origin, message)
            if first > last:
                raise RegularExpressionError(origin, 'Character ranges must have the lowest character first')
            result_cps.add((ord(first), ord(last)))
            cursor = cursor + 3
        else:
            if first is DASH:
                raise RegularExpressionError(origin, 'Dash without an initial character')
            if isinstance(first, six.text_type):
                result_cps.add(ord(first))
            else:
                result_cps.extend(first)
            cursor = cursor + 1

    return result_cps, has_following_subtraction, position
Example #10
0
 def asSingleCharacter(self):
     """Return this set's single character, if it has exactly one.

     @return: The character as a unicode string when the set
     represents a single character, otherwise C{None}."""
     cps = self.__codepoints
     # Exactly two stored entries, no more than one apart, is what
     # identifies a single character here.
     is_single = (len(cps) == 2) and not (cps[1] - cps[0] > 1)
     return six.unichr(cps[0]) if is_single else None