Python __read_one_utf8_code_from_stream Exemples, quex.core_engine.utf8.__read_one_utf8_code_from_stream Python Exemples

Exemple #1

0

Afficher le fichier

def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code == 0xFF:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code == ord("-"):
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f: return tracker.match_set.inverse()
    else: return tracker.match_set

Exemple #2

0

Afficher le fichier

Fichier : traditional_character_set.py Projet : AugustoMoura/GritEnginePR

def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:                        stream.seek(position); return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF: 
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"): 
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            if char_code_2 in [0xFF, ord(']')]: 
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code == ord("-"):
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"): 
                char_code_2 = snap_backslashed_character.do(sh)  

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f: return tracker.match_set.inverse()
    else:                  return tracker.match_set

Exemple #3

0

Afficher le fichier

Fichier : snap_character_string.py Projet : yura9889/grit-engine

def get_character_code_sequence(sh):
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    # Only \" is a special character '"', any other backslashed character
    # remains as the sequence 'backslash' + character
    sequence = []
    while 1 + 1 == 2:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)
        if char_code == 0xFF:
            raise RegularExpressionException(
                "End of file reached while parsing quoted string.")

        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)
            if char_code == None:
                raise RegularExpressionException(
                    "Unidentified backslash-sequence in quoted string.")

        elif char_code == ord('"'):
            break

        sequence.append(char_code)

    return sequence

Exemple #4

0

Afficher le fichier

def snap_non_control_character(stream, PatternDict):
    __debug_entry("non-control characters", stream)

    # (*) read first character
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    if char_code == 0xFF:
        error_msg(
            "Character could not be interpreted as UTF8 code or End of File reached prematurely.",
            stream)
    result = StateMachine()
    result.add_transition(result.init_state_index, char_code, AcceptanceF=True)
    return __debug_exit(result, stream)

Exemple #5

0

Afficher le fichier

Fichier : code_fragment.py Projet : jirkamarsik/quex

def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()
    
    start = fh.read(1)
    if start == "":  
        seek(pos); return -1

    elif start == "'": 
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)

        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C": seek(pos); return -1
        # read Unicode Name 
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "": seek(pos); return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(ucs_name, ucs_property_db["Name"].code_point_db,
                                "The string %s\ndoes not identify a known unicode character." % ucs_name, 
                                fh)
        elif type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh) 
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code != None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1

Exemple #6

0

Afficher le fichier

Fichier : snap_character_string.py Projet : AugustoMoura/GritEnginePR

def get_character_code_sequence(sh):
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    # Only \" is a special character '"', any other backslashed character
    # remains as the sequence 'backslash' + character
    sequence = []
    while 1 + 1 == 2:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)
        if char_code == 0xFF:
            raise RegularExpressionException("End of file reached while parsing quoted string.")

        elif char_code == ord("\\"): 
            char_code = snap_backslashed_character.do(sh)
            if char_code == None: 
                raise RegularExpressionException("Unidentified backslash-sequence in quoted string.")

        elif char_code == ord('"'):
            break

        sequence.append(char_code)

    return sequence

Exemple #7

0

Afficher le fichier

Fichier : code_fragment.py Projet : AugustoMoura/GritEnginePR

def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()
    
    start = fh.read(1)
    if start == "":  
        seek(pos); return -1

    elif start == "'": 
        # read an utf-8 char an get the token-id
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        if character_code == 0xFF:
            error_msg("Missing utf8-character for definition of character code by character.", fh)

        elif fh.read(1) != '\'':
            error_msg("Missing closing ' for definition of character code by character.", fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C": seek(pos); return -1
        # read Unicode Name 
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "": seek(pos); return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            error_msg("%s does not identify a known unicode character." % ucs_name, fh) 
        if type(character_code) not in [int, long]:
            error_msg("%s relates to more than one character in unicode database." % ucs_name, fh) 
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        base = second
        if base not in ["x", "o", "b"]: 
            error_msg("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base + \
                      "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try: 
            if   base == "x": character_code = int("0x" + number_txt, 16) 
            elif base == "o": character_code = int(number_txt, 8) 
            elif base == "b": 
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if   letter == "1":   character_code += 1
                    elif letter != "0":
                        error_msg("Letter '%s' not permitted in binary number (something start with '0b')" % letter, fh)
            else:
                # A normal integer number (starting with '0' though)
                character_code = int(base + number_text)
        except:
            error_msg("The string '%s' is not appropriate for number base '0%s'." % (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        fh.seek(-2, 1) # undo 'start' and 'second'
        # All that remains is that it is a 'normal' integer
        number_txt = read_integer(fh)

        if number_txt == "": fh.seek(pos); return -1
        
        try:    return int(number_txt)
        except: error_msg("The string '%s' is not appropriate for number base '10'." % number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos); return -1

Exemple #8

0

Afficher le fichier

def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        character_code = __read_one_utf8_code_from_stream(fh)
        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)

        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = read_identifier(fh)
        if ucs_name == "":
            seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            error_msg(
                "%s does not identify a known unicode character." % ucs_name,
                fh)
        if type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database." %
                ucs_name, fh)
        return character_code

    second = fh.read(1)
    if start == "0" and second.isdigit() == False:
        base = second
        if base not in ["x", "o", "b"]:
            error_msg("Number base '0%s' is unknown, please use '0x' for hexidecimal,\n" % base + \
                      "'0o' for octal, or '0b' for binary.", fh)
        number_txt = read_integer(fh)
        if number_txt == "":
            error_msg("Missing integer number after '0%s'" % base, fh)
        try:
            if base == "x": character_code = int("0x" + number_txt, 16)
            elif base == "o": character_code = int(number_txt, 8)
            elif base == "b":
                character_code = 0
                for letter in number_txt:
                    character_code = character_code << 1
                    if letter == "1": character_code += 1
                    elif letter != "0":
                        error_msg(
                            "Letter '%s' not permitted in binary number (something start with '0b')"
                            % letter, fh)
            else:
                # A normal integer number (starting with '0' though)
                character_code = int(base + number_text)
        except:
            error_msg(
                "The string '%s' is not appropriate for number base '0%s'." %
                (number_txt, base), fh)

        return character_code

    elif start.isdigit():
        fh.seek(-2, 1)  # undo 'start' and 'second'
        # All that remains is that it is a 'normal' integer
        number_txt = read_integer(fh)

        if number_txt == "":
            fh.seek(pos)
            return -1

        try:
            return int(number_txt)
        except:
            error_msg(
                "The string '%s' is not appropriate for number base '10'." %
                number_txt, fh)

    else:
        # Try to interpret it as something else ...
        fh.seek(pos)
        return -1

Exemple #9

0

Afficher le fichier

Fichier : traditional_character_set.py Projet : jirkamarsik/quex

def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:                        stream.seek(position); return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code     = None
    quote_checker = DoubleQuoteChecker() # Checks for " appearing twice. Some users did use
    #                                    # constructs such as "-" and ended up in confusing behavior.
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF: 
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"): 
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]: 
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code == ord("-"):
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"): 
                char_code_2 = snap_backslashed_character.do(sh)  

            # value denotes 'end', i.e first character outside the interval => add 1
            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f: 
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(Interval(0, Setup.get_character_value_limit()))
        return result
    else:                  
        return tracker.match_set

Exemple #10

0

Afficher le fichier

def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    quote_checker = DoubleQuoteChecker(
    )  # Checks for " appearing twice. Some users did use
    #                                    # constructs such as "-" and ended up in confusing behavior.
    while char_code != 0xFF:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code == 0xFF:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code == ord("-"):
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            # value denotes 'end', i.e first character outside the interval => add 1
            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(
                Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set

Exemple #11

0

Afficher le fichier

def read_character_code(fh):
    # NOTE: This function is tested with the regeression test for feature request 2251359.
    #       See directory $QUEX_PATH/TEST/2251359.
    pos = fh.tell()

    start = fh.read(1)
    if start == "":
        seek(pos)
        return -1

    elif start == "'":
        # read an utf-8 char an get the token-id
        # Example: '+'
        if check(fh, "\\"):
            # snap_backslashed_character throws an exception if 'backslashed char' is nonsense.
            character_code = snap_backslashed_character.do(
                fh, ReducedSetOfBackslashedCharactersF=True)
        else:
            character_code = __read_one_utf8_code_from_stream(fh)

        if character_code == 0xFF:
            error_msg(
                "Missing utf8-character for definition of character code by character.",
                fh)

        elif fh.read(1) != '\'':
            error_msg(
                "Missing closing ' for definition of character code by character.",
                fh)

        return character_code

    if start == "U":
        if fh.read(1) != "C":
            seek(pos)
            return -1
        # read Unicode Name
        # Example: UC MATHEMATICAL_MONOSPACE_DIGIT_FIVE
        skip_whitespace(fh)
        ucs_name = __read_token_identifier(fh)
        if ucs_name == "":
            seek(pos)
            return -1
        # Get the character set related to the given name. Note, the size of the set
        # is supposed to be one.
        character_code = ucs_property_db.get_character_set("Name", ucs_name)
        if type(character_code) in [str, unicode]:
            verify_word_in_list(
                ucs_name, ucs_property_db["Name"].code_point_db,
                "The string %s\ndoes not identify a known unicode character." %
                ucs_name, fh)
        elif type(character_code) not in [int, long]:
            error_msg(
                "%s relates to more than one character in unicode database." %
                ucs_name, fh)
        return character_code

    fh.seek(pos)
    character_code = read_integer(fh)
    if character_code != None: return character_code

    # Try to interpret it as something else ...
    fh.seek(pos)
    return -1

Exemple #12

0

Afficher le fichier

Fichier : core.py Projet : AugustoMoura/GritEnginePR

def snap_non_control_characters(stream):
    """Snaps any 'non_control_character' using UTF8 encoding from the given string. Note, that 
       in UTF8 a character may consist of more than one byte. Creates a state machine 
       that contains solely one trigger for each character to a acceptance state.

       This function **concatinates** incoming characters, but **repetition** has preceedence
       over concatination, so it checks after each character whether it is followed by
       a repetition ('*', '+', '?', '{..}'). In such a case, the repetition of the character
       is appended.
    """
    __debug_entry("non-control characters", stream)

    result      = StateMachine()
    state_index = result.init_state_index
    # (*) read first character
    position  = stream.tell()
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    while char_code != 0xFF:
        # (1) check against occurence of control characters
        #     this needs to come **before** the backslashed character interpretation.
        #     NOTE: A backslashed character can be a whitespace (for example '\n'). 
        #     (check against 0xFF to avoid overflow in function 'chr()') 
        if char_code < 0xFF \
           and (chr(char_code) in CONTROL_CHARACTERS or chr(char_code).isspace()):
               stream.seek(-1, 1) 
               break 

        # (2) treat backslashed characters
        if char_code == ord('\\'):
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set == None:
                stream.seek(1, 1)  # snap_property_set() leaves tream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code == None:
                    raise RegularExpressionException("Backslash followed by unrecognized character code.")
                trigger_set = char_code
        else:
            trigger_set = char_code

        # (3) read next character
        position       = stream.tell()
        next_char_code = utf8.__read_one_utf8_code_from_stream(stream)
        #    -- check for repetition (repetition has preceedence over concatination)
        if next_char_code in [ord("+"), ord("*"), ord("?"), ord("{")]:
            # (*) create state machine that consist of a single transition 
            tmp = StateMachine()
            tmp.add_transition(tmp.init_state_index, trigger_set, AcceptanceF=True)
            # -- repeat the single character state machine
            stream.seek(position)
            tmp_repeated = __snap_repetition_range(tmp, stream) 
            # -- append it to the result (last state must be set to acceptance for concatenation)
            result.states[state_index].set_acceptance()
            result = sequentialize.do([result, tmp_repeated], MountToFirstStateMachineF=True)
            # as soon as there is repetition there might be more than one acceptance
            # state and thus simple concatination via 'add_transition' fails.
            # let us return and check treat the remaining chars
            # at the next call to this function.
            return __debug_exit(result, stream)

        else:
            # (*) add new transition from current state to a new state triggering
            #     on the given character.
            state_index = result.add_transition(state_index, trigger_set)

        char_code = next_char_code

    # last character in the chain triggers an 'acceptance state'
    result.states[state_index].set_acceptance()
        
    return __debug_exit(result, stream)

Exemple #13

0

Afficher le fichier

Fichier : core.py Projet : yura9889/grit-engine

def snap_non_control_characters(stream):
    """Snaps any 'non_control_character' using UTF8 encoding from the given string. Note, that 
       in UTF8 a character may consist of more than one byte. Creates a state machine 
       that contains solely one trigger for each character to a acceptance state.

       This function **concatinates** incoming characters, but **repetition** has preceedence
       over concatination, so it checks after each character whether it is followed by
       a repetition ('*', '+', '?', '{..}'). In such a case, the repetition of the character
       is appended.
    """
    __debug_entry("non-control characters", stream)

    result = StateMachine()
    state_index = result.init_state_index
    # (*) read first character
    position = stream.tell()
    char_code = utf8.__read_one_utf8_code_from_stream(stream)
    while char_code != 0xFF:
        # (1) check against occurence of control characters
        #     this needs to come **before** the backslashed character interpretation.
        #     NOTE: A backslashed character can be a whitespace (for example '\n').
        #     (check against 0xFF to avoid overflow in function 'chr()')
        if char_code < 0xFF \
           and (chr(char_code) in CONTROL_CHARACTERS or chr(char_code).isspace()):
            stream.seek(-1, 1)
            break

        # (2) treat backslashed characters
        if char_code == ord('\\'):
            stream.seek(-1, 1)
            trigger_set = character_set_expression.snap_property_set(stream)
            if trigger_set == None:
                stream.seek(
                    1, 1)  # snap_property_set() leaves tream right before '\\'
                char_code = snap_backslashed_character.do(stream)
                if char_code == None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
                trigger_set = char_code
        else:
            trigger_set = char_code

        # (3) read next character
        position = stream.tell()
        next_char_code = utf8.__read_one_utf8_code_from_stream(stream)
        #    -- check for repetition (repetition has preceedence over concatination)
        if next_char_code in [ord("+"), ord("*"), ord("?"), ord("{")]:
            # (*) create state machine that consist of a single transition
            tmp = StateMachine()
            tmp.add_transition(tmp.init_state_index,
                               trigger_set,
                               AcceptanceF=True)
            # -- repeat the single character state machine
            stream.seek(position)
            tmp_repeated = __snap_repetition_range(tmp, stream)
            # -- append it to the result (last state must be set to acceptance for concatenation)
            result.states[state_index].set_acceptance()
            result = sequentialize.do([result, tmp_repeated],
                                      MountToFirstStateMachineF=True)
            # as soon as there is repetition there might be more than one acceptance
            # state and thus simple concatination via 'add_transition' fails.
            # let us return and check treat the remaining chars
            # at the next call to this function.
            return __debug_exit(result, stream)

        else:
            # (*) add new transition from current state to a new state triggering
            #     on the given character.
            state_index = result.add_transition(state_index, trigger_set)

        char_code = next_char_code

    # last character in the chain triggers an 'acceptance state'
    result.states[state_index].set_acceptance()

    return __debug_exit(result, stream)