def snap_set_term(stream, PatternDict):
    """Parse a single set term from 'stream' and return the resulting set.

    After skipping whitespace, an identifier is read and interpreted as:

      -- one of the operations 'union', 'intersection', 'difference',
         'inverse': the operand sets are read by snap_set_list() and
         combined accordingly.
      -- a key of special_character_set_db(): the stored set is returned.
      -- any other non-empty word: reported through verify_word_in_list().
      -- no identifier at all: the stream is rewound to where the term
         started and snap_set_expression() parses it instead.

    The 'inverse' of several sets is the inverse of their union. If the
    character value limit from Setup is not -1, the inverse is intersected
    with Interval(0, limit).

    RAISES: RegularExpressionException, if 'union', 'intersection' or
            'difference' is given fewer than two sets.

    NOTE(review): the operand set_list[0] is combined through unite_with(),
    intersect_with() and subtract() without using their return value, so
    these presumably modify it in place -- confirm against NumberSet.
    """
    __debug_entry("set_term", stream)

    operation_list = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db().keys()

    skip_whitespace(stream)
    # Remember where the term starts: if no identifier follows, the
    # stream is rewound and parsed as a plain set expression.
    start_position = stream.tell()

    keyword = read_identifier(stream)

    if keyword not in operation_list:
        if keyword in character_set_list:
            result = special_character_set_db()[keyword]
        elif keyword != "":
            # Presumably this reports the error and does not return;
            # 'result' stays unassigned on this path.
            verify_word_in_list(keyword, character_set_list + operation_list,
                                "Unknown keyword '%s'." % keyword, stream)
        else:
            stream.seek(start_position)
            result = snap_set_expression(stream, PatternDict)
        return __debug_exit(result, stream)

    # A syntax error inside the operand list surfaces as an exception
    # from snap_set_list().
    operand_list = snap_set_list(stream, keyword, PatternDict)
    operand_count = len(operand_list)
    result = operand_list[0]

    if keyword == "inverse":
        # The inverse of multiple sets is the inverse of their union.
        for character_set in operand_list[1:]:
            result.unite_with(character_set)
        result = result.inverse()
        if Setup.get_character_value_limit() != -1:
            result.intersect_with(
                Interval(0, Setup.get_character_value_limit()))
        return __debug_exit(result, stream)

    if operand_count < 2:
        raise RegularExpressionException(
            "Regular Expression: A %s operation needs at least\n" % keyword
            + "two sets to operate on them.")

    # Name of the method that folds one further operand into 'result'.
    combine_method_name = {
        "union": "unite_with",
        "intersection": "intersect_with",
        "difference": "subtract",
    }[keyword]
    for operand in operand_list[1:]:
        getattr(result, combine_method_name)(operand)

    return __debug_exit(result, stream)
def snap_set_term(stream, PatternDict):
    """Read one set term from 'stream' and return the set it denotes.

    Whitespace is skipped, then an identifier is read:

      -- 'union', 'intersection', 'difference', 'inverse': operand sets
         come from snap_set_list() and are combined by the operation.
      -- a name found in special_character_set_db(): that set is returned.
      -- some other non-empty word: handed to verify_word_in_list() for
         error reporting.
      -- nothing: the stream position is restored and the term is parsed
         by snap_set_expression().

    For 'inverse' with more than one operand, the operands are united
    first and the union is inverted. Unless Setup's character value limit
    is -1, the inverse is intersected with Interval(0, limit).

    RAISES: RegularExpressionException, if fewer than two sets are given
            to 'union', 'intersection' or 'difference'.

    NOTE(review): the first operand is presumably modified in place by
    unite_with() / intersect_with() / subtract() -- their return values
    are never used here. Confirm against the set class.
    """
    __debug_entry("set_term", stream)

    operation_list = ["union", "intersection", "difference", "inverse"]
    character_set_list = special_character_set_db().keys()

    skip_whitespace(stream)
    # Position to return to when no identifier is found.
    term_begin = stream.tell()

    name = read_identifier(stream)

    if name in operation_list:
        # snap_set_list() raises on a syntax error in the operand list.
        sets = snap_set_list(stream, name, PatternDict)
        set_count = len(sets)
        result = sets[0]

        if name == "inverse":
            # Inverse of several sets := inverse of their union.
            for further_set in sets[1:]:
                result.unite_with(further_set)
            result = result.inverse()
            if Setup.get_character_value_limit() != -1:
                result.intersect_with(
                    Interval(0, Setup.get_character_value_limit()))
            return __debug_exit(result, stream)

        if set_count < 2:
            raise RegularExpressionException(
                "Regular Expression: A %s operation needs at least\n" % name
                + "two sets to operate on them.")

        # Select the method that merges a further operand into 'result'.
        method_name_db = {
            "union": "unite_with",
            "intersection": "intersect_with",
            "difference": "subtract",
        }
        for further_set in sets[1:]:
            getattr(result, method_name_db[name])(further_set)

    elif name in character_set_list:
        result = special_character_set_db()[name]

    elif name != "":
        # Presumably reports the unknown keyword and does not return;
        # 'result' is not assigned on this path.
        verify_word_in_list(name, character_set_list + operation_list,
                            "Unknown keyword '%s'." % name, stream)

    else:
        stream.seek(term_begin)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
def create_ALL_BUT_NEWLINE_state_machine():
    """Build a state machine that triggers on any character except newline.

    A fresh StateMachine receives one transition from its init state. The
    trigger set is the inverse of the newline interval; unless Setup's
    character value limit equals sys.maxint, it is additionally
    intersected with Interval(0, limit).

    RETURNS: The new StateMachine.

    NOTE: Buffer control characters are supposed to be filtered out by
          the code generator.
    NOTE(review): AcceptanceF=True presumably marks the transition's
          target state as accepting -- confirm against
          StateMachine.add_transition().
    """
    sm = StateMachine()

    newline_interval = Interval(ord("\n"))
    all_but_newline = NumberSet(newline_interval.inverse())

    # sys.maxint is the value compared against to detect "no limit" here.
    if Setup.get_character_value_limit() != sys.maxint:
        domain = Interval(0, Setup.get_character_value_limit())
        all_but_newline.intersect_with(domain)

    sm.add_transition(sm.init_state_index, all_but_newline, AcceptanceF=True)
    return sm
def __delete_forbidden_ranges(sm, fh):
    """Restrict every transition of 'sm' to the admissible character range.

    Unicode defines only code points >= 0; number set operations may have
    produced trigger sets that reach outside of UnicodeInterval. For each
    transition of each state:

      -- the trigger set is intersected with UnicodeInterval, if it
         reaches outside of it,
      -- an error is reported via error_msg(), if its supremum exceeds
         Setup's character value limit,
      -- for the codecs 'utf16-le' and 'utf16-be', ForbiddenRange
         (0xD800-0xDFFF according to the message) is cut out and a
         message is issued with DontExitF=True,
      -- the transition is deleted, if its trigger set became empty.

    sm -- state machine whose transitions are modified in place.
    fh -- passed on to error_msg(); presumably the file handle used to
          locate the pattern in the input -- confirm against error_msg().

    NOTE: This operation might result in orphaned states that have to be
          deleted.
    NOTE(review): transitions are deleted while looping over
          get_map().items(); that is presumably only safe because
          .items() returns a list copy (Python 2) -- verify before
          porting.
    """
    value_limit = Setup.get_character_value_limit()
    utf16_codec_list = ["utf16-le", "utf16-be"]

    for state in sm.states.values():
        transition_map = state.transitions().get_map()

        for target_index, triggers in transition_map.items():
            # Make sure all transitions lie inside the unicode code range.
            reaches_outside_unicode = (
                triggers.minimum() < UnicodeInterval.begin
                or triggers.supremum() >= UnicodeInterval.end)
            if reaches_outside_unicode:
                triggers.intersect_with(UnicodeInterval)

            if triggers.supremum() > value_limit:
                limit_message = \
                    "Pattern contains character beyond the scope of the buffer element size (%s)\n" \
                    % Setup.get_character_value_limit_str() + \
                    "Please, cut the character range of the regular expression,\n" \
                    "adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n" + \
                    "or specify '--buffer-element-size-irrelevant' to ignore the issue."
                error_msg(limit_message, fh)

            # Delete the forbidden interval: D800-DFFF
            if Setup.buffer_codec in utf16_codec_list \
               and triggers.has_intersection(ForbiddenRange):
                utf16_message = \
                    "Pattern contains characters in unicode range 0xD800-0xDFFF.\n" \
                    "This range is not covered by UTF16. Cutting Interval."
                error_msg(utf16_message, fh, DontExitF=True)
                triggers.cut_interval(ForbiddenRange)

            # If the cutting removed the whole path to the target state,
            # then delete the transition.
            if triggers.is_empty():
                state.transitions().delete_transitions_to_target(target_index)
def __prune_trigger_map_to_character_type_domain(trigger_map):
    """Cut 'trigger_map' down to the value range of the character type.

    trigger_map -- sequence of (interval, target) entries. The loop stops
                   at the first interval that lies beyond the limit, so
                   the entries are assumed to be sorted by ascending
                   interval position.

    'interval.end' is treated as the first value outside of an interval:
    an interval with 'end <= 0' contains no admissible character. The
    limit from Setup.get_character_value_limit() is used as such an 'end',
    too.

    RETURNS: 'trigger_map' itself, if the limit is -1 (no pruning).
             Otherwise a new list where
               -- intervals completely below zero are dropped,
               -- intervals starting at or beyond the limit are dropped,
               -- the interval reaching the limit is clipped to end there.
             Intervals that start below zero but reach into the
             admissible range are kept as they are.
    """
    UpperLimit = Setup.get_character_value_limit()
    LowerLimit = 0

    if UpperLimit == -1:
        return trigger_map

    new_trigger_map = []
    for entry in trigger_map:
        interval, target = entry

        if interval.end <= LowerLimit:
            # No character can have a value below zero
            continue
        elif interval.begin >= UpperLimit:
            # '>=' rather than '>': an interval that begins exactly at the
            # limit contains no admissible character. With '>' it fell
            # through to the clipping branch below and produced the empty
            # interval (UpperLimit, UpperLimit).
            break
        elif interval.end < UpperLimit:
            new_trigger_map.append(entry)
        else:
            # Interval reaches the limit. Thus it is the last one that
            # contributes; it is clipped so that it ends at the limit.
            new_trigger_map.append(
                [Interval(interval.begin, UpperLimit), target])

    return new_trigger_map
def __prune_trigger_map_to_character_type_domain(trigger_map):
    """Restrict 'trigger_map' to the values the character type can hold.

    trigger_map -- sequence of (interval, target) entries; assumed to be
                   sorted by ascending interval position, because the
                   loop stops at the first interval beyond the limit.

    An interval's 'end' is handled as the first value behind the interval
    (an interval with 'end <= 0' holds no admissible character). The value
    of Setup.get_character_value_limit() is used as an 'end' in the same
    sense.

    RETURNS: The unchanged 'trigger_map' object, if the limit is -1.
             Otherwise a new list:
               -- intervals entirely below zero are left out,
               -- intervals beginning at or behind the limit are left out,
               -- an interval reaching the limit is replaced by a copy
                  that ends at the limit.
             An interval that begins below zero but reaches into the
             admissible range is taken over unchanged.
    """
    UpperLimit = Setup.get_character_value_limit()
    LowerLimit = 0

    if UpperLimit == -1:
        return trigger_map

    new_trigger_map = []
    for entry in trigger_map:
        interval, target = entry

        if interval.end <= LowerLimit:
            # No character can have a value below zero
            continue
        elif interval.begin >= UpperLimit:
            # Was '>': an interval starting exactly at the limit then
            # reached the clipping branch and was turned into the empty
            # interval (UpperLimit, UpperLimit). It holds no admissible
            # character, so pruning ends here.
            break
        elif interval.end < UpperLimit:
            new_trigger_map.append(entry)
        else:
            # Interval overlaps the end. Thus it is the last one to be
            # taken over, cut so that it ends at the limit.
            new_trigger_map.append(
                [Interval(interval.begin, UpperLimit), target])

    return new_trigger_map
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
    code points that corresponds to the characters and character ranges
    mentioned.

    sh -- stream of class 'StringIO' or 'file'. Parsing starts at the
          current position with an optional '^' (negation) and consumes
          everything up to and including the closing ']'. Presumably the
          caller has already consumed the opening '[' -- confirm at the
          call site.

    RETURNS: tracker.match_set, if the set is not negated. Otherwise its
             inverse; unless Setup's character value limit equals
             sys.maxint, the inverse is intersected with
             Interval(0, limit).

    RAISES: RegularExpressionException, if
              -- '-' appears without a preceding character,
              -- the closing ']' is missing,
              -- '-' is not followed by a character, or by another '-',
              -- a range such as 'a-a' contains only one element.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consume 'letter' if it comes next; otherwise leave the stream
        # position untouched.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    # Checks for " appearing twice. Some users did use constructs such as
    # "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()

    # The loop is left by 'break' at the closing ']' or by an exception.
    # A loop condition on 'char_code != 0xFF' would also fire when an
    # escaped character evaluates to 0xFF and silently stop before ']'.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException("Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code == 0xFF:
            # 0xFF is what the reader delivers when no closing ']' was
            # found; presumably its end-of-input / error marker.
            raise RegularExpressionException("Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range: 'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException("Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character BEHIND the operator is the one to check.
                # Testing 'char_code' let 'a--' pass and rejected an
                # escaped '\-' used as the begin of a range.
                raise RegularExpressionException("Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the
            # interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a NumberSet of
    code points that corresponds to the characters and character ranges
    mentioned.

    sh -- stream of class 'StringIO' or 'file'. An optional leading '^'
          negates the set; input is consumed up to and including the
          closing ']'. Presumably the opening '[' has been consumed by
          the caller -- confirm at the call site.

    RETURNS: tracker.match_set for a plain set. For a negated set, the
             inverse of it, intersected with Interval(0, limit) unless
             Setup's character value limit equals sys.maxint.

    RAISES: RegularExpressionException, if
              -- a '-' has no preceding character,
              -- no closing ']' is found,
              -- a '-' is followed by ']', by the end of input, or by
                 another '-',
              -- begin and end of a range are the same character.
    """
    assert sh.__class__.__name__ == "StringIO" \
        or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Look ahead one character: consume it if it is 'letter',
        # otherwise restore the stream position.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        else:
            stream.seek(position)
            return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    # Checks for " appearing twice. Some users did use constructs such as
    # "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()

    # Termination happens inside the loop: 'break' on ']', exception on
    # missing ']'. Testing 'char_code != 0xFF' in the loop head would
    # also end parsing, without error and before ']', when an escaped
    # character evaluates to 0xFF.
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'."
            )
        elif char_code == 0xFF:
            # 0xFF is delivered when the input ended without ']';
            # presumably the reader's end-of-input / error marker.
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range: 'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [0xFF, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # Check the character behind the operator. The former
                # test on 'char_code' never caught 'a--' and wrongly
                # rejected ranges starting with an escaped '\-'.
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")
            # value denotes 'end', i.e first character outside the
            # interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

    if tracker.negation_f:
        result = tracker.match_set.inverse()
        if Setup.get_character_value_limit() != sys.maxint:
            result.intersect_with(
                Interval(0, Setup.get_character_value_limit()))
        return result
    else:
        return tracker.match_set