Exemple #1
0
def snap_set_term(stream, PatternDict):
    """Parses one 'set term' from the stream and returns the resulting set.

       Depending on the identifier found at the current stream position:

         -- 'union', 'intersection', 'difference', 'inverse': the following
            set list is parsed by 'snap_set_list()' and combined accordingly.
         -- a key of 'special_character_set_db()': the database entry for
            that key is returned.
         -- no identifier (empty word): the stream is rewound and the term
            is parsed by 'snap_set_expression()'.
         -- anything else: reported through 'verify_word_in_list()'.

       'PatternDict' is passed through to the sub-parsers unchanged.

       RAISES: RegularExpressionException, if 'union', 'intersection' or
               'difference' is given less than two sets.
    """
    __debug_entry("set_term", stream)

    operation_list = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db().keys()

    skip_whitespace(stream)
    # Remember the start of the term, so that the stream can be rewound if
    # no keyword is found.
    position = stream.tell()

    word = read_identifier(stream)

    if word in operation_list:
        # A syntax error inside the set list is reported by an exception
        # raised during 'snap_set_list()'.
        set_list = snap_set_list(stream, word, PatternDict)

        set_n = len(set_list)
        # NOTE: The first set of the list is modified in place below.
        result = set_list[0]
        remainder = set_list[1:]

        if word == "inverse":
            # The inverse of multiple sets is the inverse of their union.
            for character_set in remainder:
                result.unite_with(character_set)
            result = result.inverse()
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(
                    Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if set_n < 2:
            raise RegularExpressionException(
                "Regular Expression: A %s operation needs at least\n" % word
                + "two sets to operate on them.")

        # Map the keyword to the in-place set operation which implements it.
        method_name = {
            "union":        "unite_with",
            "intersection": "intersect_with",
            "difference":   "subtract",
        }[word]
        combine = getattr(result, method_name)
        for character_set in remainder:
            combine(character_set)

    elif word in character_set_list:
        result = special_character_set_db()[word]

    elif word == "":
        # No identifier at all => rewind and parse a plain set expression.
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    else:
        # NOTE(review): 'result' is not assigned on this path. Presumably
        #               'verify_word_in_list()' does not return for an
        #               unknown keyword -- to be confirmed.
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)

    return __debug_exit(result, stream)
def snap_set_term(stream, PatternDict):
    """Reads a single 'set term' from 'stream' and delivers the set it denotes.

       The identifier at the current position decides what happens:

         -- an operation name ('union', 'intersection', 'difference',
            'inverse') is followed by a list of sets, which is read by
            'snap_set_list()' and folded into the first set of that list.
         -- a name found in 'special_character_set_db()' delivers the
            corresponding database entry.
         -- an empty identifier causes a rewind of the stream and a call
            to 'snap_set_expression()'.
         -- any other identifier is handed to 'verify_word_in_list()'.

       'PatternDict' is only forwarded to the called parsers.

       RAISES: RegularExpressionException, if an operation other than
               'inverse' comes with less than two sets.
    """
    __debug_entry("set_term", stream)

    operation_list     = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db().keys()

    skip_whitespace(stream)
    position = stream.tell()   # where to return to, if there is no keyword

    word = read_identifier(stream)

    if word in operation_list:
        # Errors during the parsing of the set list are signalized by an
        # exception raised from within 'snap_set_list()'.
        set_list = snap_set_list(stream, word, PatternDict)

        set_n  = len(set_list)
        result = set_list[0]   # modified in place by the operations below

        if word == "inverse":
            # Several sets: invert the union of all of them.
            for other in set_list[1:]:
                result.unite_with(other)
            result = result.inverse()
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if set_n < 2:
            raise RegularExpressionException(
                "Regular Expression: A %s operation needs at least\n" % word +
                "two sets to operate on them.")

        for other in set_list[1:]:
            if word == "union":
                result.unite_with(other)
            elif word == "intersection":
                result.intersect_with(other)
            else:
                # Only 'difference' remains of the operation list.
                result.subtract(other)

    elif word in character_set_list:
        result = special_character_set_db()[word]

    elif word == "":
        # Nothing that looks like a keyword => plain set expression.
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    else:
        # NOTE(review): No value is assigned to 'result' here. Presumably
        #               'verify_word_in_list()' never returns in case of an
        #               unknown keyword -- to be confirmed.
        verify_word_in_list(word, character_set_list + operation_list,
                            "Unknown keyword '%s'." % word, stream)

    return __debug_exit(result, stream)
Exemple #3
0
def create_ALL_BUT_NEWLINE_state_machine():
    """Creates a state machine with a single transition from its initial
       state. The transition triggers on the inverse of the newline
       interval and is added with 'AcceptanceF=True'.

       If 'Setup.get_character_value_limit()' differs from 'sys.maxint',
       the trigger set is restricted to 'Interval(0, limit)'.

       RETURNS: The newly created StateMachine.
    """
    sm = StateMachine()

    # NOTE: Buffer control characters are supposed to be filtered out by the
    #       code generator.
    newline_interval = Interval(ord("\n"))
    trigger_set      = NumberSet(newline_interval.inverse())

    if Setup.get_character_value_limit() != sys.maxint:
        trigger_set.intersect_with(Interval(0, Setup.get_character_value_limit()))

    sm.add_transition(sm.init_state_index, trigger_set, AcceptanceF=True)
    return sm
Exemple #4
0
def __delete_forbidden_ranges(sm, fh):
    """Restricts the trigger sets of all transitions of 'sm' to admissible
       code points. Unicode defines only code points >= 0, but number set
       operations may have produced values outside that range.

       For each transition of each state:

         (1) The trigger set is intersected with 'UnicodeInterval', if it
             exceeds it.
         (2) An error is reported via 'error_msg()', if the trigger set
             reaches beyond 'Setup.get_character_value_limit()'.
         (3) For the codecs 'utf16-le' and 'utf16-be' the 'ForbiddenRange'
             is cut out and a message is issued with 'DontExitF=True'.
         (4) The transition is deleted, if its trigger set became empty.

       'fh' is passed to 'error_msg()' along with the messages.

       NOTE: This operation might result in orphaned states that have to
             be deleted.
    """
    character_value_limit = Setup.get_character_value_limit()

    for state in sm.states.values():
        # NOTE(review): Transitions are deleted while walking over 'items()'.
        #               This relies on 'items()' delivering a copy (Python 2
        #               list) -- presumably the case here; confirm.
        for target_index, trigger_set in state.transitions().get_map().items():

            # (1) Make sure that the transition lies inside the unicode range.
            below_f = trigger_set.minimum() < UnicodeInterval.begin
            if below_f or trigger_set.supremum() >= UnicodeInterval.end:
                trigger_set.intersect_with(UnicodeInterval)

            # (2) Characters beyond what a buffer element can carry.
            if trigger_set.supremum() > character_value_limit:
                message = (
                    "Pattern contains character beyond the scope of the buffer element size (%s)\n"
                    % Setup.get_character_value_limit_str()
                    + "Please, cut the character range of the regular expression,\n"
                    + "adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n"
                    + "or specify '--buffer-element-size-irrelevant' to ignore the issue.")
                error_msg(message, fh)

            # (3) Delete the forbidden interval: D800-DFFF
            if     Setup.buffer_codec in ["utf16-le", "utf16-be"] \
               and trigger_set.has_intersection(ForbiddenRange):
                error_msg("Pattern contains characters in unicode range 0xD800-0xDFFF.\n"
                          "This range is not covered by UTF16. Cutting Interval.",
                          fh, DontExitF=True)
                trigger_set.cut_interval(ForbiddenRange)

            # (4) Nothing left that triggers to the target => delete the path.
            if not trigger_set.is_empty():
                continue
            state.transitions().delete_transitions_to_target(target_index)
Exemple #5
0
def __prune_trigger_map_to_character_type_domain(trigger_map):
    """Cuts the trigger map down to the range of values which a character
       can take, i.e. to [0, UpperLimit) where 'UpperLimit' is given by
       'Setup.get_character_value_limit()'.

       trigger_map -- sequence of (interval, target) entries. The early
                      termination of the loop assumes that the entries are
                      sorted by interval -- presumably guaranteed by the
                      caller; confirm.

       RETURNS: 'trigger_map' itself, if the limit is -1 (no pruning);
                otherwise a new list. Entries below the limit are taken
                over as they are. The entry that reaches the limit is
                replaced by a copy whose interval ends at 'UpperLimit'.

       An interval's 'end' is the first value outside of it. Therefore, an
       interval that begins exactly at 'UpperLimit' lies completely outside
       the domain and must not be clamped into an empty interval.
    """
    UpperLimit = Setup.get_character_value_limit()
    LowerLimit = 0

    if UpperLimit == -1: return trigger_map

    new_trigger_map = []
    for entry in trigger_map:
        interval, target = entry

        if interval.end <= LowerLimit:
            # No character can have a value below zero
            continue
        elif interval.begin >= UpperLimit:
            # Interval lies completely beyond the limit ('>=', since the
            # limit itself is already outside the domain).
            break
        elif interval.end < UpperLimit:
            new_trigger_map.append(entry)
        else:
            # Interval overlaps the end. Thus it is the last and
            # nothing behind it needs to be checked.
            new_trigger_map.append([Interval(interval.begin, UpperLimit), target])
            break
    return new_trigger_map
Exemple #6
0
def __prune_trigger_map_to_character_type_domain(trigger_map):
    """Restricts a trigger map to the value range of a character, that is
       [0, UpperLimit) with 'UpperLimit' taken from
       'Setup.get_character_value_limit()'.

       trigger_map -- sequence of (interval, target) entries. The loop
                      stops at the first entry at or beyond the limit,
                      which assumes entries sorted by interval --
                      presumably ensured by the caller; confirm.

       RETURNS: The unchanged 'trigger_map' object, if the limit is -1.
                Otherwise, a new list: entries ending at or below zero are
                dropped, entries ending below the limit are kept, and the
                one entry reaching the limit is replaced by a copy that
                ends at 'UpperLimit'.

       Since 'end' denotes the first value outside an interval, an interval
       with 'begin == UpperLimit' contains no admissible character. It is
       dropped rather than clamped to an empty interval.
    """
    UpperLimit = Setup.get_character_value_limit()
    LowerLimit = 0

    if UpperLimit == -1: return trigger_map

    new_trigger_map = []
    for entry in trigger_map:
        interval, target = entry

        if interval.end <= LowerLimit:
            # No character can have a value below zero
            continue
        elif interval.begin >= UpperLimit:
            # Nothing of this interval (and of those behind it) is inside
            # the domain; the limit itself is the first value outside.
            break
        elif interval.end < UpperLimit:
            new_trigger_map.append(entry)
        else:
            # Interval overlaps the end. Thus it is the last one to be
            # considered.
            new_trigger_map.append(
                [Interval(interval.begin, UpperLimit), target])
            break
    return new_trigger_map
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream positioned behind the opening '['. It must be a
             'StringIO' or 'file' object (checked by assertion).

       A leading '^' inverts the set. If the character value limit of
       'Setup' is not 'sys.maxint', the inverse is restricted to
       'Interval(0, limit)'.

       RETURNS: The match set collected by the 'Tracker', or its inverse.

       RAISES: RegularExpressionException, if
                 -- '-' appears without a preceding character,
                 -- the closing ']' is missing,
                 -- '-' is not followed by a character,
                 -- '-' is followed by another '-',
                 -- a range consists of a single element, as in 'a-a'.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consumes the next character only if it is equal to 'letter'.
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:                        stream.seek(position); return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    quote_checker = DoubleQuoteChecker() # Checks for " appearing twice. Some users did use
    #                                    # constructs such as "-" and ended up in confusing behavior.

    # The loop is left only by the closing ']' or by an exception. A loop
    # condition on 0xFF would silently stop the parsing in the middle of the
    # expression, if a backslashed character evaluates to 0xFF.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF: 
            # 0xFF read directly from the stream is treated as 'no more input'.
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"): 
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]: 
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # It is the character BEHIND the operator that must not be
                # a '-'. The range's first character may well be a
                # backslashed '-'.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"): 
                char_code_2 = snap_backslashed_character.do(sh)  

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f: 
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(Interval(0, Setup.get_character_value_limit()))
        return result
    else:                  
        return tracker.match_set
Exemple #8
0
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
       code points that corresponds to the characters and character ranges mentioned.

       sh -- stream positioned behind the opening '['. An assertion
             requires it to be a 'StringIO' or a 'file' object.

       If the expression starts with '^', the inverse of the collected set
       is returned. It is cut to 'Interval(0, limit)', if the character
       value limit of 'Setup' is different from 'sys.maxint'.

       RETURNS: The 'match_set' of the 'Tracker', or its inverse.

       RAISES: RegularExpressionException, if
                 -- the range operator '-' has no preceding character,
                 -- there is no closing ']',
                 -- there is no character behind the range operator,
                 -- the range operator is followed by another '-',
                 -- both ends of a range are the same character.
    """
    assert     sh.__class__.__name__ == "StringIO" \
            or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Eats the next character, but only if it is 'letter'.
        position = stream.tell()
        if stream.read(1) == letter: return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()

    if __check_letter(sh, "^"): tracker.negation_f = True

    # Checks for " appearing twice. Some users did use constructs such as
    # "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()

    # Only the closing ']' or an exception terminates the loop. With a loop
    # condition on 0xFF, a backslashed character of value 0xFF would end the
    # parsing silently, before the closing ']' is reached.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code == 0xFF:
            # 0xFF coming directly from the stream is taken as 'input ended'.
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The check concerns the character behind the operator. The
                # first character of the range may be a backslashed '-'.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(
                Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set