Esempio n. 1
0
def snap_set_list(stream, set_operation_name, PatternDict):
    """Parses the bracketed argument list of a set operation, i.e.

           '(' set_term [ ',' set_term ]* ')'

       Whitespace around brackets, terms, and commas is skipped. Each term
       is parsed by 'snap_set_term()'.

       set_operation_name -- name of the operation; only used in error
                             messages.

       RETURNS: Whatever '__debug_exit()' delivers for the list of parsed
                terms.

       RAISES:  RegularExpressionException, if the opening bracket is
                missing, a term could not be parsed, or a term is followed
                by something other than ',' or ')'.
    """
    __debug_entry("set_list", stream)

    skip_whitespace(stream)
    opener = stream.read(1)
    if opener != "(":
        raise RegularExpressionException(
            "Missing opening bracket '%s' operation." % set_operation_name)

    term_list = []
    while True:
        skip_whitespace(stream)
        term = snap_set_term(stream, PatternDict)
        if term is None:
            raise RegularExpressionException(
                "Missing set expression list after '%s' operation." %
                set_operation_name)
        term_list.append(term)

        skip_whitespace(stream)
        delimiter = stream.read(1)
        if delimiter == ",":
            # another term follows
            continue
        if delimiter == ")":
            return __debug_exit(term_list, stream)

        # Neither ',' nor ')': step back over the offending letter, then complain.
        stream.seek(-1, 1)
        raise RegularExpressionException(
            "Missing closing ')' after after '%s' operation." %
            set_operation_name)
Esempio n. 2
0
def get_character_code_sequence(sh):
    """Collects the character codes of a quoted string from 'sh' until the
       closing (un-backslashed) double quote is found.

       A backslash hands control over to 'snap_backslashed_character.do()';
       the code it returns is stored in place of the backslash sequence, so
       that a backslashed double quote does not terminate the string.

       RETURNS: List of character codes; the closing quote is consumed but
                not part of the list.

       RAISES:  RegularExpressionException, if the stream ends before the
                closing quote or a backslash sequence yields 'None'.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    QUOTE     = ord('"')
    BACKSLASH = ord("\\")

    code_list = []
    while True:
        code = utf8.__read_one_utf8_code_from_stream(sh)

        if code is None:
            raise RegularExpressionException(
                "End of file reached while parsing quoted string.")

        if code == QUOTE:
            # un-backslashed quote => end of string
            return code_list

        if code == BACKSLASH:
            code = snap_backslashed_character.do(sh)
            if code is None:
                raise RegularExpressionException(
                    "Unidentified backslash-sequence in quoted string.")

        code_list.append(code)
Esempio n. 3
0
def __parse_property_expression(stream, PropertyLetter, EqualConditionPossibleF=True):
    r"""Parses an expression of the form '\? { X [ = Y] }' where
       ? = PropertyLetter.

       PropertyLetter          -- single letter expected behind the backslash.
       EqualConditionPossibleF -- if False, an expression containing '=' is
                                  rejected.

       RETURNS: List of stripped fields. If the '=' operator is present the
                list has two elements: [left hand side, right hand side].
                Otherwise it has one element.

       RAISES:  RegularExpressionException, if the expression does not start
                with the backslashed property letter followed by '{', if the
                brackets are empty, if more than one '=' appears, or if '='
                appears while it is not allowed.
    """
    assert len(PropertyLetter) == 1
    assert type(PropertyLetter) == str
    assert type(EqualConditionPossibleF) == bool

    # verify '\?'
    x = stream.read(2)
    if x != "\\" + PropertyLetter:
        # Two placeholders => two arguments (expected letter, received text).
        raise RegularExpressionException("Unicode property letter '\\%s' expected, received '%s'." \
                                         % (PropertyLetter, x))

    skip_whitespace(stream)

    x = stream.read(1)
    if x != "{":
        raise RegularExpressionException("Unicode property '\\%s' not followed by '{'." % PropertyLetter)

    content = __snap_until(stream, "}")

    # 'split()' always delivers at least one field, so emptiness has to be
    # detected on the content itself.
    if content.strip() == "":
        raise RegularExpressionException("Unicode property expression '\\%s{}' cannot have no content." \
                                         % PropertyLetter)

    fields = content.split("=")

    if len(fields) > 2:
        raise RegularExpressionException("Unicode property expression '\\%s' can have at maximum one '='." \
                                         % PropertyLetter)

    if not EqualConditionPossibleF and len(fields) == 2:
        raise RegularExpressionException("Unicode property expression '\\%s' does not allow '=' conditions" \
                                         % PropertyLetter)

    # A real list: callers index the result and take its length.
    return [field.strip() for field in fields]
Esempio n. 4
0
def snap_replacement(stream, PatternDict, StateMachineF=True):
    """Reads 'identifier }' from the stream and looks the identifier up in
       'PatternDict'. According to the error messages below, the opening '{'
       has been consumed by the caller.

       StateMachineF -- True:  deliver 'reference.get_state_machine()'.
                        False: deliver 'reference.get_character_set()'.

       RETURNS: The state machine or the character set of the referenced
                pattern, depending on 'StateMachineF'.

       RAISES:  RegularExpressionException, if the identifier or the closing
                '}' is missing. Unknown identifiers, references that are no
                character sets, and empty character sets are reported via
                the 'error' module.
    """
    skip_whitespace(stream)
    pattern_name = read_identifier(stream)
    if pattern_name == "":
        raise RegularExpressionException(
            "Pattern replacement expression misses identifier after '{'.")
    skip_whitespace(stream)

    closed_f = check(stream, "}")
    if not closed_f:
        raise RegularExpressionException("Pattern replacement expression misses closing '}' after '%s'." \
                                         % pattern_name)

    unknown_name_msg = "Specifier '%s' not found in any preceeding 'define { ... }' section." \
                       % pattern_name
    error.verify_word_in_list(pattern_name, PatternDict.keys(),
                              unknown_name_msg, stream)

    reference = PatternDict[pattern_name]
    assert reference.__class__ == PatternShorthand

    if not StateMachineF:
        # -- replacement inside a character set expression
        character_set = reference.get_character_set()
        if character_set is None:
            error.log(
                "Replacement in character set expression must be a character set.\n"
                "Specifier '%s' relates to a pattern state machine." %
                pattern_name, stream)

        if character_set.is_empty():
            error.log(
                "Referenced character set '%s' is empty.\nAborted." %
                pattern_name, stream)

        return character_set

    # -- replacement by a state machine
    state_machine = reference.get_state_machine()
    assert isinstance(state_machine, DFA)

    # Patterns used as replacements must not carry origins (specific
    # acceptance ids); otherwise the optimization of patterns containing
    # replacements may get confused and miss optimizations.
    assert not state_machine.has_specific_acceptance_id()

    # NOTE: A check that rejects pre- or post-conditioned patterns in
    #       replacements existed here, but is disabled.
    return state_machine
Esempio n. 5
0
def __snap_word(stream):
    """Reads a word by means of 'read_until_letter()' with '(' as the only
       terminating letter, then steps the stream back by one letter.

       RETURNS: The word that was read, stripped of surrounding whitespace.

       RAISES:  RegularExpressionException, if 'read_until_letter()' fails.
    """
    try:
        the_word = read_until_letter(stream, ["("])
    except Exception:
        # Only real errors are translated; KeyboardInterrupt and SystemExit
        # must pass through untouched.
        raise RegularExpressionException("Missing opening bracket.")
    stream.seek(-1, 1)
    return the_word.strip()
Esempio n. 6
0
def snap_set_expression(stream, PatternDict):
    """Parses a character set expression. The alternatives are tried in the
       following order:

           property set        -- whatever 'snap_property_set()' accepts
           '\\C' ...            -- case folded pattern (as number set)
           '[:' set_term ':]'  -- set term in special brackets
           '[' ...             -- traditional character set
           '{' ...             -- pattern replacement (as character set)

       RETURNS: The parsed set; 'None' (through '__debug_exit()') if none of
                the alternatives applies, which includes the end of the
                stream.

       RAISES:  RegularExpressionException, if the closing ':]' is missing.
    """
    assert     stream.__class__.__name__ == "StringIO" \
            or stream.__class__.__name__ == "file"

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    if result is not None: return result

    x = stream.read(2)
    if x == "\\C":
        return snap_case_folded_pattern(stream, PatternDict, NumberSetF=True)

    elif x == "[:":
        result = snap_set_term(stream, PatternDict)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)

    # 'startswith()' instead of 'x[0]': at the end of the stream 'x' is the
    # empty string and indexing it would raise an IndexError.
    elif x.startswith("["):
        # put back the letter that followed the '['
        stream.seek(-1, 1)
        result = traditional_character_set.do(stream)

    elif x.startswith("{"):
        # put back the letter that followed the '{'
        stream.seek(-1, 1)
        result = snap_replacement(stream, PatternDict, StateMachineF=False)

    else:
        result = None

    return __debug_exit(result, stream)
Esempio n. 7
0
def snap_bracketed_expression(stream, PatternDict):
    """Parses 'expression )' by means of 'snap_expression()' and 'check()'.
       Judging by the closing-bracket check, the opening '(' has been
       consumed by the caller.

       RETURNS: The result of 'snap_expression()'.

       RAISES:  RegularExpressionException, if the closing ')' is missing
                (the message quotes the line from where parsing started) or
                if 'snap_expression()' delivered 'None' (the message quotes
                the text consumed so far).
    """
    start_position = stream.tell()
    expression     = snap_expression(stream, PatternDict)

    if check(stream, ")"):
        if expression is not None:
            return expression

        # Brackets are closed, but nothing sensible was inside.
        consumed_n = stream.tell() - start_position
        stream.seek(start_position)
        raise RegularExpressionException("expression in brackets has invalid syntax '%s'" % \
                                         stream.read(consumed_n))

    # No closing bracket: report the line where the expression started.
    stream.seek(start_position)
    line          = stream.readline()
    remainder_txt = line.replace("\n", "").replace("\r", "")
    message  = "Missing closing ')' after expression; found '%s'.\n" % remainder_txt
    message += "Note, that patterns end with the first non-quoted whitespace.\n"
    message += "Also, closing brackets in quotes do not close a syntax block."
    raise RegularExpressionException(message)
Esempio n. 8
0
    def consider_interval(self, Begin, End):
        """Adds 'Interval(Begin, End)' to 'self.match_set'.

           Begin -- code point of the first character of the range.
           End   -- code point of the first character OUTSIDE the range;
                    the character set parser in this file passes 'last
                    character + 1'.
                    NOTE(review): confirm that no other caller passes an
                    inclusive 'End'.

           RAISES: RegularExpressionException, if the range is reversed,
                   i.e. its last character has a lower code than its first.
        """
        # 'End' is exclusive, thus 'Begin == End' already means that the last
        # character lies one below the first (e.g. 'b-a').
        if Begin >= End:
            # Report the character that was actually written, not the one behind it.
            Last = End - 1
            raise RegularExpressionException("Character range: '-' requires character with 'lower code' to preceed\n" + \
                                             "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                                             (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(Last), Begin, Last))

        self.match_set.add_interval(Interval(Begin, End))
Esempio n. 9
0
 def do(self, CharacterCode):
     """Counts double quotes in 'self.quote_n'. Any other character code is
        ignored.

        RAISES: RegularExpressionException at the moment the counter
                reaches two.
     """
     if CharacterCode == ord("\""):
         self.quote_n += 1
         if self.quote_n == 2:
             raise RegularExpressionException(
                 "Character '\"' appears twice in character range [ ... ] expression.\n"
                 "You cannot exempt characters this way. Please, use backslash or\n"
                 "split the character range expression.")
Esempio n. 10
0
def snap_set_term(stream, PatternDict):
    """Parses a set term, i.e. one of

           operation '(' set_term [ ',' set_term ]* ')'
                 with operation = union, intersection, difference, inverse
           name  of an entry in 'special_character_set_db'
           anything that 'snap_set_expression()' accepts

       The operations work IN PLACE on the first set of the list. 'inverse'
       accepts a single set; multiple sets are united before the complement
       is taken. All other operations require at least two sets.

       RETURNS: The resulting set (through '__debug_exit()').

       RAISES:  RegularExpressionException, if a binary operation receives
                less than two sets. Unknown keywords are reported through
                'error.verify_word_in_list()'.
                NOTE(review): presumably that function does not return for
                an unknown word; otherwise 'result' is unbound at the end.
    """
    global special_character_set_db

    __debug_entry("set_term", stream)

    operation_list     = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db.keys()

    skip_whitespace(stream)
    start_position = stream.tell()

    # Without an identifier at this place the term must be a plain set expression.
    word = read_identifier(stream)

    if word in operation_list:
        # Syntax errors inside the list are reported by exception.
        operand_list = snap_set_list(stream, word, PatternDict)
        operand_n    = len(operand_list)
        result       = operand_list[0]
        remainder    = operand_list[1:]

        if word == "inverse":
            # inverse of many sets = inverse of their union
            for character_set in remainder:
                result.unite_with(character_set)
            complement = result.get_complement(Setup.buffer_codec.source_set)
            return __debug_exit(complement, stream)

        if operand_n < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if   word == "union":        combine = result.unite_with
        elif word == "intersection": combine = result.intersect_with
        else:                        combine = result.subtract      # "difference"

        for character_set in remainder:
            combine(character_set)

    elif word in character_set_list:
        reg_expr = special_character_set_db[word]
        result   = traditional_character_set.do_string(reg_expr)

    elif word == "":
        # No identifier: rewind and try a set expression.
        stream.seek(start_position)
        result = snap_set_expression(stream, PatternDict)

    else:
        error.verify_word_in_list(word, character_set_list + operation_list,
                                  "Unknown keyword '%s'." % word, stream)

    return __debug_exit(result, stream)
Esempio n. 11
0
def __snap_repetition_range(the_state_machine, stream):
    """Snaps a string that represents a repetition range. The following
       syntaxes are supported:
           '?'      one or none repetition
           '+'      one or arbitrary repetition
           '*'      arbitrary repetition (even zero)
           '{n}'    exactly 'n' repetitions
           '{m,n}'  from 'm' to 'n' repetitions
           '{n,}'   arbitrary, but at least 'n' repetitions

       RETURNS: The repeated state machine as delivered by 'repeat.do()'.
                If no repetition command follows (which includes '{...}'
                content that does not start with a digit), the stream is
                rewound and 'the_state_machine' itself is returned.

       RAISES:  RegularExpressionException, if the content of '{...}' cannot
                be interpreted, or if 'repeat.do()' fails on it.
    """
    assert the_state_machine.__class__.__name__ == "DFA", \
           "received object of type '%s'" % the_state_machine.__class__.__name__ + "\n" + \
           repr(the_state_machine)

    position_0 = stream.tell()
    x = stream.read(1)
    if   x == "+": return repeat.do(the_state_machine, 1)
    elif x == "*": return repeat.do(the_state_machine)
    elif x == "?": return repeat.do(the_state_machine, 0, 1)
    elif x != "{":
        # no repetition range, so everything remains as it is
        stream.seek(position_0)
        return the_state_machine

    repetition_range_str = __snap_until(stream, "}")
    if len(repetition_range_str) and not repetition_range_str[0].isdigit():
        # no repetition range, so everything remains as it is
        stream.seek(position_0)
        return the_state_machine

    try:
        if repetition_range_str.find(",") == -1:
            # no ',' thus "match exactly a certain number":
            # e.g. {4} = match exactly four repetitions
            number = int(repetition_range_str)
            return repeat.do(the_state_machine, number, number)

        # A range of numbers is given. A real list is required, because the
        # fields are accessed by index.
        fields = [field.strip() for field in repetition_range_str.split(",")]

        number_1 = int(fields[0])
        if fields[1] == "": number_2 = -1              # e.g. {2,}
        else:               number_2 = int(fields[1])  # e.g. {2,5}

        # produce repeated state machine
        return repeat.do(the_state_machine, number_1, number_2)

    except Exception:
        # Any failure on the way is reported as a syntax error of the range.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        raise RegularExpressionException("error while parsing repetition range expression '%s'" \
                                         % repetition_range_str)
Esempio n. 12
0
def do(sh, ReducedSetOfBackslashedCharactersF=False):
    """Interprets the letters that follow a backslash. The backslash itself
       is expected to be consumed already; the first letter read from 'sh'
       decides:

           letter from the list of known letters -- entry of 'backslashed_character_db'
           digit                                 -- octal number, up to 5 digits
           'x'                                   -- hexadecimal number, up to 2 digits
           'X'                                   -- hexadecimal number, up to 4 digits
           'U'                                   -- hexadecimal number, up to 6 digits

       ReducedSetOfBackslashedCharactersF -- True:  only a fixed short list
                                                    of letters is known.
                                             False: all keys of
                                                    'backslashed_character_db'
                                                    are known.

       RETURNS: The value found in 'backslashed_character_db', or the result
                of the number parser.

       RAISES:  RegularExpressionException, at the end of the stream or if
                the letter is not known.
    """
    assert sh.__class__ == StringIO or sh.__class__ == file
    assert type(ReducedSetOfBackslashedCharactersF) == bool

    if not ReducedSetOfBackslashedCharactersF:
        known_letter_list = backslashed_character_db.keys()
    else:
        known_letter_list = [ 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"' ]

    letter = sh.read(1)
    if letter == "":
        raise RegularExpressionException("End of file while parsing backslash sequence.")

    if letter in known_letter_list:
        return backslashed_character_db[letter]

    if letter.isdigit():
        # the digit belongs to the number => put it back
        sh.seek(-1,1)
        return __parse_octal_number(sh, 5)

    # prefix letter --> maximum number of hexadecimal digits
    hex_digit_n = { 'x': 2, 'X': 4, 'U': 6 }.get(letter)
    if hex_digit_n is None:
        raise RegularExpressionException("Backslashed '%s' is unknown to quex." % letter)

    return __parse_hex_number(sh, hex_digit_n)
Esempio n. 13
0
def __snap_until(stream, ClosingDelimiter, OpeningDelimiter=None):
    """Reads letters from the stream until the un-backslashed closing
       delimiter that belongs to the current bracket level is found.

       ClosingDelimiter -- letter that ends the text.
       OpeningDelimiter -- optional letter that opens a nested level; each
                           un-backslashed occurrence requires one more
                           closing delimiter. With 'None' there is no nesting.

       A backslash toggles the 'backslashed' state for the letter that
       follows it; backslashes themselves remain part of the result.

       RETURNS: The text before the final closing delimiter. The delimiter
                is consumed, but not part of the result.

       RAISES:  RegularExpressionException, if the stream ends first.
    """
    letter_list    = []
    backslash_f    = False
    open_bracket_n = 1
    while True:
        letter = stream.read(1)
        if letter == "":
            raise RegularExpressionException(
                "Unable to find closing delimiter '%s'" % ClosingDelimiter)

        if letter == "\\":
            letter_list.append(letter)
            backslash_f = not backslash_f
            continue

        if not backslash_f:
            if letter == ClosingDelimiter:
                if open_bracket_n == 1:
                    # final delimiter: not part of the result
                    return "".join(letter_list)
                open_bracket_n -= 1
            elif letter == OpeningDelimiter:
                # NOTE: if OpeningDelimiter is None, then this can never be the case!
                open_bracket_n += 1

        letter_list.append(letter)
        # a backslash only affects the letter directly behind it
        backslash_f = False
Esempio n. 14
0
def __parse_base_number(sh, MaxL, DigitSet, Base, NumberName):
    """Reads a number from the stream, letter by letter.

       MaxL       -- Maximum length of number to be parsed.
       DigitSet   -- collection of letters that are accepted as digits.
       Base       -- base passed to the conversion of the digit string.
       NumberName -- name of the number type; only used in the error message.

       Reading stops after 'MaxL' digits, at the first letter that is not in
       'DigitSet' (which is put back into the stream), or at the end of the
       stream.

       RETURNS: The value of the digits as integer.

       RAISES:  RegularExpressionException, if not a single digit was found.
    """
    number_str = ""
    tmp        = sh.read(1)
    while tmp != "" and tmp in DigitSet:
        number_str += tmp
        if len(number_str) == MaxL: break
        tmp = sh.read(1)
    else:
        # Loop left without 'break': 'tmp' is either the end of the stream or
        # a letter that does not belong to the number. The latter is put back.
        if tmp != "": sh.seek(-1,1)

    if number_str == "":
        raise RegularExpressionException("Missing %s number." % NumberName)

    # 'int()' copes with arbitrarily large numbers and, unlike 'long()',
    # exists in every Python version.
    return int(number_str, Base)
Esempio n. 15
0
def do_shortcut(stream, ShortcutLetter, PropertyAlias):
    r"""Name property shortcut '\ShortcutLetter{...}' which is a shortcut
       for '\P{PropertyAlias=...}'.

       Example: An expression of the form '\N{CHARACTER NAME}' is parsed
       with ShortcutLetter = 'N'; the text in brackets is passed as value of
       the property 'PropertyAlias' to the unicode property database.

       RETURNS: The character set delivered by
                'ucs_property_db.get_character_set()'.

       RAISES:  RegularExpressionException, if the expression is malformed,
                if it contains '=', or if the database delivers a string
                (which is then used as error message).

       NOTE: The docstring is a raw string on purpose. Without it, the
             backslash-N sequence above is a syntax error in Python 3.
    """
    content = __parse_property_expression(stream, ShortcutLetter, EqualConditionPossibleF=False)
    # if len(content) != 1 then an exception is thrown

    property_value = content[0]

    result = ucs_property_db.get_character_set(PropertyAlias, property_value)

    # a string as result carries an error message
    if isinstance(result, str):
        raise RegularExpressionException(result)

    return result
Esempio n. 16
0
def do(stream):
    r"""Property expression: '\P{...}'

       Parse an expression of the forms:

       '\P{property = value}' or '\P{binary_property}'

       and return the related character set.

       RETURNS: The character set delivered by
                'ucs_property_db.get_character_set()'. For the second form
                the property value passed to the database is 'None'.

       RAISES:  RegularExpressionException, if the expression is malformed
                or if the database delivers a string (which is then used as
                error message).
    """
    content = __parse_property_expression(stream, "P")
    # if len(content) < 1 or > 2 then an exception is thrown

    property_name = content[0]
    if len(content) == 1: property_value = None
    else:                 property_value = content[1]

    result = ucs_property_db.get_character_set(property_name, property_value)

    # a string as result carries an error message
    if isinstance(result, str):
        raise RegularExpressionException(result)

    return result
Esempio n. 17
0
def snap_primary(stream, PatternDict):
    """primary:  quoted character string
                 [ ... ]             = set of characters
                 { identifier }      = pattern replacement
                 .                   = any character but newline
                 ( expression )
                 backslash ...       = command, property set, or
                                       backslashed character
                 non control characters
                 primary repetition_cmd

       RETURNS: The state machine of the primary, with an optional
                repetition applied (through '__debug_exit()'). 'None' if
                the stream is at its end, or starts with whitespace, a
                control character, or the special terminator; in the latter
                three cases the letter is put back into the stream.

       RAISES:  RegularExpressionException, on a repetition operator
                without a preceding expression, or on a backslash that is
                followed by an unrecognized character code.
    """
    global SPECIAL_TERMINATOR

    __debug_entry("primary", stream)
    letter = stream.read(1)

    # -- cases where no primary begins
    if letter == "":
        return __debug_exit(None, stream)

    if letter.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    if letter in ("*", "+", "?"):
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % letter)

    # -- 'primary' primary
    if letter == "\"":
        primary = snap_character_string.do(stream)

    elif letter == "[":
        stream.seek(-1, 1)
        primary = snap_character_set_expression(stream, PatternDict)

    elif letter == "{":
        primary = snap_replacement(stream, PatternDict)

    elif letter == ".":
        primary = create_ALL_BUT_NEWLINE_state_machine(stream)

    elif letter == "(":
        primary = snap_bracketed_expression(stream, PatternDict)

    elif letter == "\\":
        primary = snap_command(stream, PatternDict)
        if primary is None:
            # Not a command: put the backslash back and try a property set.
            stream.seek(-1, 1)
            trigger_set = snap_property_set(stream)
            if trigger_set is None:
                # Not a property set either: eat the backslash and
                # interpret what follows as a backslashed character.
                stream.read(1)
                trigger_set = snap_backslashed_character.do(stream)
                if trigger_set is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
            primary = DFA()
            primary.add_transition(primary.init_state_index,
                                   trigger_set,
                                   AcceptanceF=True)

    elif letter in CONTROL_CHARACTERS or letter == SPECIAL_TERMINATOR:
        # Not a valid primary. This covers the '$' sign ('end of line'),
        # which is a control character that is handled on the very top level.
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    else:
        # NOTE: The backslash is not a control character, so that character
        #       codes can be defined by backslashed sequences; it has been
        #       treated above.
        stream.seek(-1, 1)
        primary = snap_non_control_character(stream, PatternDict)

    # -- optional repetition command?
    repeated = __snap_repetition_range(primary, stream)
    if repeated is None:
        repeated = primary

    return __debug_exit(repeated, stream)
Esempio n. 18
0
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       Letters are read from 'sh' until the closing ']'. A leading '^'
       requests the complement of the set. The set expression parser in this
       file calls this function with the stream positioned directly behind
       the opening '['.

       RETURNS: 'tracker.match_set', or its complement with respect to
                'Setup.buffer_encoding.source_set' if the expression started
                with '^'.

       RAISES:  RegularExpressionException, if
                  -- the closing ']' is missing,
                  -- a '-' has no character in front of it or none behind it,
                  -- a '-' is directly followed by another '-',
                  -- a range consists of a single element,
                  -- the double quote checker or the tracker complain.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        """Consumes 'letter' if it comes next; otherwise, leaves the stream untouched."""
        position = stream.tell()
        if stream.read(1) == letter: return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceeded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    char_code = None
    # Checks for " appearing twice. Some users did use constructs such as
    # "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code is None:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the letter BEHIND the operator that must not be a
                # plain '-'. The first letter may well be a backslashed '-'.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        # Only possible, if the backslash interpreter delivered 'None'.
        if char_code is None: break

    if tracker.negation_f:
        return tracker.match_set.get_complement(
            Setup.buffer_encoding.source_set)
    else:
        return tracker.match_set