def snap_set_list(stream, set_operation_name, PatternDict):
    """Parses the bracketed, comma separated argument list of a set operation,
    i.e. the '( term, term, ... )' that follows an operation keyword.

    Each argument is parsed by 'snap_set_term()'. Whitespace around brackets,
    arguments and commas is skipped.

    set_operation_name = name of the operation; only used in error messages.

    RETURNS: List of the objects returned by 'snap_set_term()', in the order
             of their appearance (passed through '__debug_exit()').

    RAISES:  RegularExpressionException if the opening bracket, an argument,
             or the closing bracket is missing.
    """
    __debug_entry("set_list", stream)

    skip_whitespace(stream)
    if stream.read(1) != "(":
        raise RegularExpressionException(
            "Missing opening bracket after '%s' operation." % set_operation_name)

    set_list = []
    while True:
        skip_whitespace(stream)
        result = snap_set_term(stream, PatternDict)
        if result is None:
            raise RegularExpressionException(
                "Missing set expression list after '%s' operation." % set_operation_name)
        set_list.append(result)

        skip_whitespace(stream)
        tmp = stream.read(1)
        if tmp == ",":
            # Another argument follows.
            continue
        if tmp != ")":
            # Step back onto the offending character before reporting.
            stream.seek(-1, 1)
            raise RegularExpressionException(
                "Missing closing ')' after '%s' operation." % set_operation_name)
        return __debug_exit(set_list, stream)
def get_character_code_sequence(sh):
    """Reads character codes from 'sh' up to the next un-backslashed double
    quote and collects them in a list.

    A backslash is not stored itself; instead the code delivered by
    'snap_backslashed_character.do()' for the sequence behind it is stored.
    The terminating double quote is consumed but not stored.

    RETURNS: List of character codes.

    RAISES:  RegularExpressionException if the stream ends before the closing
             quote, or if the backslash handler delivers None.
    """
    assert sh.__class__.__name__ in ("StringIO", "file")

    code_list = []
    while True:
        code = utf8.__read_one_utf8_code_from_stream(sh)

        if code is None:
            raise RegularExpressionException(
                "End of file reached while parsing quoted string.")

        if code == ord('"'):
            # Un-backslashed quote => end of the quoted string.
            return code_list

        if code == ord("\\"):
            code = snap_backslashed_character.do(sh)
            if code is None:
                raise RegularExpressionException(
                    "Unidentified backslash-sequence in quoted string.")

        code_list.append(code)
def __parse_property_expression(stream, PropertyLetter, EqualConditionPossibleF=True):
    r"""Parses an expression of the form '\? { X [ = Y] }' where ? = PropertyLetter.

    PropertyLetter          = the single letter expected behind the backslash.
    EqualConditionPossibleF = False, if an '=' inside the brackets is an error.

    RETURNS: List of whitespace-stripped fields:
                 [X]     if no '=' is present,
                 [X, Y]  if an '=' is present.

    RAISES:  RegularExpressionException if the introducing '\?' or the '{' is
             missing, if the brackets are empty, if more than one '=' appears,
             or if '=' appears although EqualConditionPossibleF is False.
    """
    assert len(PropertyLetter) == 1
    assert type(PropertyLetter) == str
    assert type(EqualConditionPossibleF) == bool

    # verify '\?'
    x = stream.read(2)
    if x != "\\" + PropertyLetter:
        raise RegularExpressionException(
            "Unicode property letter '\\%s' expected, received '%s'." % (PropertyLetter, x))

    skip_whitespace(stream)

    x = stream.read(1)
    if x != "{":
        raise RegularExpressionException(
            "Unicode property '\\%s' not followed by '{'." % PropertyLetter)

    content = __snap_until(stream, "}")

    # NOTE: 'split()' always delivers at least one field. Emptiness must
    #       therefore be detected on the content itself.
    if content.strip() == "":
        raise RegularExpressionException(
            "Unicode property expression '\\%s{}' cannot have no content." % PropertyLetter)

    fields = content.split("=")

    if len(fields) > 2:
        raise RegularExpressionException(
            "Unicode property expression '\\%s' can have at maximum one '='." % PropertyLetter)

    if not EqualConditionPossibleF and len(fields) == 2:
        raise RegularExpressionException(
            "Unicode property expression '\\%s' does not allow '=' conditions" % PropertyLetter)

    # A real list (not a lazy 'map' object), so that callers can index it
    # and take its length.
    return [field.strip() for field in fields]
def snap_replacement(stream, PatternDict, StateMachineF=True):
    """Resolves a pattern replacement, i.e. an identifier followed by '}' that
    names an entry of 'PatternDict'.

    StateMachineF = True  => the entry's state machine is returned.
                  = False => the entry's character set is returned.

    RAISES: RegularExpressionException if the identifier or the closing '}'
            is missing. Unknown names, and character sets that are missing
            or empty, are reported through the 'error' module.
    """
    skip_whitespace(stream)
    pattern_name = read_identifier(stream)
    if pattern_name == "":
        raise RegularExpressionException(
            "Pattern replacement expression misses identifier after '{'.")
    skip_whitespace(stream)

    if not check(stream, "}"):
        raise RegularExpressionException(
            "Pattern replacement expression misses closing '}' after '%s'." % pattern_name)

    error.verify_word_in_list(
        pattern_name, PatternDict.keys(),
        "Specifier '%s' not found in any preceeding 'define { ... }' section." % pattern_name,
        stream)

    reference = PatternDict[pattern_name]
    assert reference.__class__ == PatternShorthand

    if not StateMachineF:
        # (*) Replacement by a character set
        character_set = reference.get_character_set()
        if character_set is None:
            error.log(
                "Replacement in character set expression must be a character set.\n"
                "Specifier '%s' relates to a pattern state machine." % pattern_name,
                stream)

        if character_set.is_empty():
            error.log(
                "Referenced character set '%s' is empty.\nAborted." % pattern_name,
                stream)

        return character_set

    # (*) Replacement by a state machine
    state_machine = reference.get_state_machine()
    assert isinstance(state_machine, DFA)

    # State machines defined as patterns must not carry origins. Otherwise,
    # the optimization of patterns that contain pattern replacements might
    # get confused and miss optimizations.
    assert not state_machine.has_specific_acceptance_id()

    # NOTE: A check that rejected pre- or post-conditioned patterns was
    #       commented out in this function; it is not enforced.
    return state_machine
def __snap_word(stream):
    """Reads the text in front of an opening bracket '('.

    The text is read with 'read_until_letter()'. Afterwards the stream is
    moved one character back (presumably onto the '(' itself -- to be
    confirmed against 'read_until_letter()').

    RETURNS: The text that was read, with surrounding whitespace stripped.

    RAISES:  RegularExpressionException if 'read_until_letter()' fails.
    """
    try:
        the_word = read_until_letter(stream, ["("])
    except Exception:
        # 'Exception' instead of a bare 'except', so that KeyboardInterrupt
        # and SystemExit are not turned into a syntax error.
        raise RegularExpressionException("Missing opening bracket.")

    stream.seek(-1, 1)
    return the_word.strip()
def snap_set_expression(stream, PatternDict):
    r"""Parses a single character set expression. Tried in this order:

        property set      -- whatever 'snap_property_set()' recognizes
        \C ...            -- case folded pattern (as a number set)
        [: set_term :]    -- set term in extended brackets
        [ ... ]           -- traditional character set
        { identifier }    -- replacement by a defined character set

    RETURNS: The parsed set, or None if none of the forms matched.

    RAISES:  RegularExpressionException if '[:' is not closed by ':]'.
    """
    assert    stream.__class__.__name__ == "StringIO" \
           or stream.__class__.__name__ == "file"

    __debug_entry("set_expression", stream)

    result = snap_property_set(stream)
    if result is not None:
        return result

    x = stream.read(2)

    def give_back_lookahead():
        # Only the first of the two letters belongs to the opening bracket.
        # If the stream ended right behind the bracket, then nothing was read
        # in excess and nothing must be given back.
        if len(x) == 2:
            stream.seek(-1, 1)

    if x == "\\C":
        return snap_case_folded_pattern(stream, PatternDict, NumberSetF=True)

    elif x == "[:":
        result = snap_set_term(stream, PatternDict)
        skip_whitespace(stream)
        x = stream.read(2)
        if x != ":]":
            raise RegularExpressionException("Missing closing ':]' for character set expression.\n" + \
                                             "found: '%s'" % x)

    # NOTE: 'x[:1]' instead of 'x[0]', because 'x' is empty at end of stream.
    elif x[:1] == "[":
        give_back_lookahead()
        result = traditional_character_set.do(stream)

    elif x[:1] == "{":
        give_back_lookahead()
        result = snap_replacement(stream, PatternDict, StateMachineF=False)

    else:
        result = None

    return __debug_exit(result, stream)
def snap_bracketed_expression(stream, PatternDict):
    """Parses the expression behind an opening '(' and consumes the closing ')'.

    RETURNS: The result of 'snap_expression()'.

    RAISES:  RegularExpressionException if the closing ')' is missing, or if
             'snap_expression()' delivered None. In both cases the message
             quotes input text starting at the position behind the '('.
    """
    start_position = stream.tell()
    result         = snap_expression(stream, PatternDict)

    if check(stream, ")"):
        if result is not None:
            return result

        # Brackets are closed, but the content is not an expression.
        consumed_n = stream.tell() - start_position
        stream.seek(start_position)
        raise RegularExpressionException(
            "expression in brackets has invalid syntax '%s'" % stream.read(consumed_n))

    # Closing bracket is missing => report the remainder of the line.
    stream.seek(start_position)
    remainder_txt = stream.readline().replace("\n", "").replace("\r", "")
    message  = "Missing closing ')' after expression; found '%s'.\n" % remainder_txt
    message += "Note, that patterns end with the first non-quoted whitespace.\n"
    message += "Also, closing brackets in quotes do not close a syntax block."
    raise RegularExpressionException(message)
def consider_interval(self, Begin, End):
    """Adds the interval from 'Begin' to 'End' to 'self.match_set'.

    Begin = first character code of the interval.
    End   = first character code BEHIND the interval. (The range parser in
            this file passes 'last character code + 1'.)

    RAISES: RegularExpressionException if the interval is empty or inverted,
            i.e. if the last character has a lower code than the first.
    """
    # Since 'End' is exclusive, 'Begin == End' already means that the last
    # character precedes the first one (e.g. 'b-a').
    if Begin >= End:
        Last = End - 1  # the character the user actually wrote behind '-'
        raise RegularExpressionException("Character range: '-' requires character with 'lower code' to preceed\n" + \
                                         "found range '%s-%s' which corresponds to %i-%i as unicode code points." % \
                                         (utf8.map_unicode_to_utf8(Begin), utf8.map_unicode_to_utf8(Last), Begin, Last))

    self.match_set.add_interval(Interval(Begin, End))
def do(self, CharacterCode):
    """Counts double quotes; the second one is an error.

    Any character code other than the one of '"' is ignored. For a double
    quote 'self.quote_n' is incremented.

    RAISES: RegularExpressionException when 'self.quote_n' reaches two.
    """
    if CharacterCode == ord('"'):
        self.quote_n += 1
        if self.quote_n == 2:
            raise RegularExpressionException(
                "Character '\"' appears twice in character range [ ... ] expression.\n"
                "You cannot exempt characters this way. Please, use backslash or\n"
                "split the character range expression.")
def snap_set_term(stream, PatternDict):
    """Parses a set term. A set term is one of

        union(...), intersection(...), difference(...), inverse(...)
                   -- set operation on a bracketed list of set terms,
        name       -- key of 'special_character_set_db',
        otherwise  -- anything 'snap_set_expression()' accepts.

    For 'inverse' with more than one argument, the complement of the union
    of all arguments is delivered.

    NOTE: The first element of the argument list is modified in place and
          serves as result of 'union', 'intersection' and 'difference'.

    RETURNS: The resulting set (passed through '__debug_exit()'); None if
             nothing could be parsed.

    RAISES:  RegularExpressionException if 'union', 'intersection' or
             'difference' receive less than two arguments.
    """
    global special_character_set_db

    __debug_entry("set_term", stream)

    operation_list     = ["union", "intersection", "difference", "inverse"]
    # A real list, so that it can be concatenated with 'operation_list' below.
    character_set_list = list(special_character_set_db.keys())

    skip_whitespace(stream)
    position = stream.tell()

    word = read_identifier(stream)

    # Defined on every path, even if 'verify_word_in_list()' returns.
    result = None

    if word in operation_list:
        # Errors in the argument list are reported by exceptions.
        set_list = snap_set_list(stream, word, PatternDict)

        L      = len(set_list)
        result = set_list[0]

        if word == "inverse":
            # The inverse of multiple sets is the inverse of their union.
            for character_set in set_list[1:]:
                result.unite_with(character_set)
            return __debug_exit(result.get_complement(Setup.buffer_codec.source_set), stream)

        if L < 2:
            raise RegularExpressionException("Regular Expression: A %s operation needs at least\n" % word + \
                                             "two sets to operate on them.")

        if word == "union":
            for character_set in set_list[1:]:
                result.unite_with(character_set)
        elif word == "intersection":
            for character_set in set_list[1:]:
                result.intersect_with(character_set)
        elif word == "difference":
            for character_set in set_list[1:]:
                result.subtract(character_set)

    elif word in character_set_list:
        reg_expr = special_character_set_db[word]
        result   = traditional_character_set.do_string(reg_expr)

    elif word != "":
        error.verify_word_in_list(word, character_set_list + operation_list,
                                  "Unknown keyword '%s'." % word, stream)

    else:
        # No identifier => give back what was read and try a set expression.
        stream.seek(position)
        result = snap_set_expression(stream, PatternDict)

    return __debug_exit(result, stream)
def __snap_repetition_range(the_state_machine, stream):
    """Snaps a string that represents a repetition range. The following
    syntaxes are supported:

        '?'      one or none repetition
        '+'      one or arbitrary repetition
        '*'      arbitrary repetition (even zero)
        '{n}'    exactly 'n' repetitions
        '{m,n}'  from 'm' to 'n' repetitions
        '{n,}'   arbitrary, but at least 'n' repetitions

    RETURNS: The repeated state machine as delivered by 'repeat.do()'. If no
             repetition command follows, the stream is reset and the given
             state machine is returned unchanged. (None is never returned.)

    RAISES:  RegularExpressionException if the content of '{...}' starts with
             a digit (or is empty) but cannot be turned into a repetition.
    """
    assert the_state_machine.__class__.__name__ == "DFA", \
           "received object of type '%s'" % the_state_machine.__class__.__name__ + "\n" + \
           repr(the_state_machine)

    position_0 = stream.tell()
    x = stream.read(1)
    if x == "+":
        return repeat.do(the_state_machine, 1)
    if x == "*":
        return repeat.do(the_state_machine)
    if x == "?":
        return repeat.do(the_state_machine, 0, 1)
    if x != "{":
        # no repetition range, so everything remains as it is
        stream.seek(position_0)
        return the_state_machine

    repetition_range_str = __snap_until(stream, "}")
    if len(repetition_range_str) and not repetition_range_str[0].isdigit():
        # '{' opens something else => everything remains as it is
        stream.seek(position_0)
        return the_state_machine

    # NOTE: 'repeat.do()' stays inside the 'try' block on purpose. That way,
    #       failures on odd ranges are still reported as syntax errors.
    try:
        if repetition_range_str.find(",") == -1:
            # no ',' thus "match exactly a certain number":
            # e.g. {4} = match exactly four repetitions
            number = int(repetition_range_str)
            return repeat.do(the_state_machine, number, number)

        # a range of numbers is given
        fields   = [field.strip() for field in repetition_range_str.split(",")]
        number_1 = int(fields[0])
        if fields[1] == "":
            number_2 = -1               # e.g. {2,}
        else:
            number_2 = int(fields[1])   # e.g. {2,5}

        return repeat.do(the_state_machine, number_1, number_2)

    except Exception:
        # Not a bare 'except': KeyboardInterrupt/SystemExit must pass through.
        raise RegularExpressionException("error while parsing repetition range expression '%s'" \
                                         % repetition_range_str)
def do(sh, ReducedSetOfBackslashedCharactersF=False):
    """Interprets what follows a backslash. The backslash itself must already
    have been read from 'sh'.

    The letter behind the backslash decides:

        known letter  -- its entry in 'backslashed_character_db'
        digit         -- octal number, up to 5 digits
        'x'           -- hexadecimal number, up to 2 digits
        'X'           -- hexadecimal number, up to 4 digits
        'U'           -- hexadecimal number, up to 6 digits

    ReducedSetOfBackslashedCharactersF = True restricts the 'known letters'
    to a, b, f, n, r, t, v, backslash and double quote. Otherwise, all keys
    of 'backslashed_character_db' are known.

    RETURNS: The value found in 'backslashed_character_db', or the value
             delivered by the number parser.

    RAISES:  RegularExpressionException at end of stream, or if the letter
             is unknown.
    """
    assert sh.__class__ == StringIO or sh.__class__ == file
    assert type(ReducedSetOfBackslashedCharactersF) == bool

    if ReducedSetOfBackslashedCharactersF:
        known_letter_list = ['a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"']
    else:
        known_letter_list = backslashed_character_db.keys()

    letter = sh.read(1)
    if letter == "":
        raise RegularExpressionException("End of file while parsing backslash sequence.")

    if letter in known_letter_list:
        return backslashed_character_db[letter]

    if letter.isdigit():
        # The digit is part of the number => give it back.
        sh.seek(-1, 1)
        return __parse_octal_number(sh, 5)

    hex_digit_n = {'x': 2, 'X': 4, 'U': 6}.get(letter)
    if hex_digit_n is None:
        raise RegularExpressionException("Backslashed '%s' is unknown to quex." % letter)

    return __parse_hex_number(sh, hex_digit_n)
def __snap_until(stream, ClosingDelimiter, OpeningDelimiter=None):
    """Reads from the stream until the matching un-backslashed closing
    delimiter occurs.

    A delimiter directly behind an odd number of backslashes does not count.
    If 'OpeningDelimiter' is given, nested pairs of opening and closing
    delimiters are skipped.

    RETURNS: The text in front of the closing delimiter (backslashes and
             nested delimiters included). The closing delimiter itself is
             consumed but not part of the result.

    RAISES:  RegularExpressionException if the stream ends before the closing
             delimiter is found.
    """
    letter_list    = []   # joined once at the end; avoids quadratic '+='
    backslash_f    = False
    open_bracket_n = 1
    while True:
        letter = stream.read(1)
        if letter == "":
            raise RegularExpressionException(
                "Unable to find closing delimiter '%s'" % ClosingDelimiter)

        if letter == "\\":
            letter_list.append(letter)
            # Two backslashes in a row neutralize each other.
            backslash_f = not backslash_f
            continue

        if not backslash_f:
            if letter == ClosingDelimiter:
                if open_bracket_n == 1:
                    break
                open_bracket_n -= 1
            elif letter == OpeningDelimiter:
                # NOTE: if OpeningDelimiter is None, then this can never be the case!
                open_bracket_n += 1

        letter_list.append(letter)
        # A backslash only affects the letter that directly follows it.
        backslash_f = False

    return "".join(letter_list)
def __parse_base_number(sh, MaxL, DigitSet, Base, NumberName):
    """Reads a number of the given base from the stream.

    MaxL       = Maximum length of number to be parsed.
    DigitSet   = collection of the characters that count as digits.
    Base       = base handed to the integer conversion.
    NumberName = name of the number type; only used in the error message.

    Reading stops after 'MaxL' digits, at the first character that is not a
    digit (which is given back to the stream), or at the end of the stream.

    RETURNS: The value of the number as integer.

    RAISES:  RegularExpressionException if no digit could be read.
    """
    number_str = ""
    tmp        = sh.read(1)
    while tmp != "" and tmp in DigitSet:
        number_str += tmp
        if len(number_str) == MaxL:
            break
        tmp = sh.read(1)
    else:
        # Loop ended without 'break': 'tmp' is either the end of the stream
        # or a non-digit. Only the latter must be given back.
        if tmp != "":
            sh.seek(-1, 1)

    if number_str == "":
        raise RegularExpressionException("Missing %s number." % NumberName)

    # 'int()' handles arbitrarily large values; 'long()' does not exist in
    # newer Python versions.
    return int(number_str, Base)
def do_shortcut(stream, ShortcutLetter, PropertyAlias):
    r"""Name property shortcut '\ShortcutLetter{...}' which is a shortcut
    for '\P{PropertyAlias=...}'.

    Example: an expression of the form '\N{CHARACTER NAME}' delivers the set
    of characters that match the given name.

    RETURNS: The character set that 'ucs_property_db' delivers for the
             property 'PropertyAlias' and the value given in the brackets.

    RAISES:  RegularExpressionException on a syntax error in the expression,
             or if 'ucs_property_db' delivers an error message (a string)
             instead of a character set.
    """
    # NOTE: This docstring is a raw string. Otherwise '\N{...}' would be
    #       read as a unicode name escape.
    # 'list()' ensures indexability, even if a lazy sequence is delivered.
    content = list(__parse_property_expression(stream, ShortcutLetter,
                                               EqualConditionPossibleF=False))
    # if len(content) != 1 then an exception is thrown

    property_value = content[0]
    result = ucs_property_db.get_character_set(PropertyAlias, property_value)

    if isinstance(result, str):
        # A string is an error message, not a character set.
        raise RegularExpressionException(result)

    return result
def do(stream):
    r"""Property expression: '\P{...}'

    Parses an expression of the forms:

        '\P{property = value}'   or   '\P{binary_property}'

    RETURNS: The character set that 'ucs_property_db' delivers for the
             property. For the binary form, the property value is None.

    RAISES:  RegularExpressionException on a syntax error in the expression,
             or if 'ucs_property_db' delivers an error message (a string)
             instead of a character set.
    """
    # 'list()' ensures that length and index access work, even if a lazy
    # sequence is delivered.
    content = list(__parse_property_expression(stream, "P"))
    # if len(content) < 1 or > 2 then an exception is thrown

    property_name = content[0]
    if len(content) == 1:
        property_value = None
    else:
        property_value = content[1]

    result = ucs_property_db.get_character_set(property_name, property_value)

    if isinstance(result, str):
        # A string is an error message, not a character set.
        raise RegularExpressionException(result)

    return result
def snap_primary(stream, PatternDict):
    """Parses a 'primary', i.e. one of

        quoted string             = character string
        [ ... ]                   = set of characters
        { identifier }            = pattern replacement
        .                         = see 'create_ALL_BUT_NEWLINE_state_machine()'
        ( expression )
        backslash sequence        = command, property set, or character code
        non control characters    = lonely characters

    optionally followed by a repetition command.

    RETURNS: The parsed object (passed through '__debug_exit()'), or None at
             end of stream, at whitespace, and at characters that cannot
             start a primary. In the latter two cases the character is given
             back to the stream.

    RAISES:  RegularExpressionException for a lonely '*', '+' or '?'.
    """
    global SPECIAL_TERMINATOR

    __debug_entry("primary", stream)
    letter = stream.read(1)
    if letter == "":
        return __debug_exit(None, stream)

    # -- 'primary' primary
    if letter == "\"":
        result = snap_character_string.do(stream)

    elif letter == "[":
        stream.seek(-1, 1)
        result = snap_character_set_expression(stream, PatternDict)

    elif letter == "{":
        result = snap_replacement(stream, PatternDict)

    elif letter == ".":
        result = create_ALL_BUT_NEWLINE_state_machine(stream)

    elif letter == "(":
        result = snap_bracketed_expression(stream, PatternDict)

    elif letter.isspace():
        # a lonestanding space ends the regular expression
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    elif letter in ("*", "+", "?"):
        raise RegularExpressionException(
            "lonely operator '%s' without expression proceeding." % letter)

    elif letter == "\\":
        result = snap_command(stream, PatternDict)
        if result is None:
            # Not a command => give the backslash back and try a property set.
            stream.seek(-1, 1)
            trigger_set = snap_property_set(stream)
            if trigger_set is None:
                # Neither => skip the backslash and read a character code.
                stream.read(1)
                trigger_set = snap_backslashed_character.do(stream)
                if trigger_set is None:
                    raise RegularExpressionException(
                        "Backslash followed by unrecognized character code.")
            result = DFA()
            result.add_transition(result.init_state_index, trigger_set, AcceptanceF=True)

    elif letter in CONTROL_CHARACTERS or letter == SPECIAL_TERMINATOR:
        # This is not a valid primary.
        # NOTE: This includes the '$' sign which means 'end of line'. It is
        #       in CONTROL_CHARACTERS, but it is handled on the very top level.
        stream.seek(-1, 1)
        return __debug_exit(None, stream)

    else:
        # NOTE: The backslash is not inside the control characters -- for a
        #       reason. It is used to define character codes ('\\x' etc.)
        stream.seek(-1, 1)
        result = snap_non_control_character(stream, PatternDict)

    # -- optional repetition command?
    repeated = __snap_repetition_range(result, stream)
    if repeated is None:
        return __debug_exit(result, stream)
    return __debug_exit(repeated, stream)
def do(sh):
    """Transforms an expression of the form [a-z0-9A-Z] into a set of code
    points that corresponds to the characters and character ranges mentioned.

    The stream must stand right behind the opening '['. A '^' as first
    character requests the complement of the set with respect to
    'Setup.buffer_encoding.source_set'. The closing ']' is consumed.

    RETURNS: 'match_set' of the tracker, or its complement.

    RAISES:  RegularExpressionException if the closing ']' is missing, if '-'
             appears without preceding or following character, if '-' is
             followed by '-', if a range consists of one element only, or if
             the double quote checker or the tracker object.
    """
    assert    sh.__class__.__name__ == "StringIO" \
           or sh.__class__.__name__ == "file"

    def __check_letter(stream, letter):
        # Consumes the next letter only if it is the given one.
        position = stream.tell()
        if stream.read(1) == letter:
            return True
        stream.seek(position)
        return False

    # check, if the set is thought to be inverse (preceded by '^')
    tracker = Tracker()
    if __check_letter(sh, "^"):
        tracker.negation_f = True

    char_code     = None
    # Checks for " appearing twice. Some users did use constructs such
    # as "-" and ended up in confusing behavior.
    quote_checker = DoubleQuoteChecker()
    while True:
        char_code = utf8.__read_one_utf8_code_from_stream(sh)

        quote_checker.do(char_code)
        if char_code == ord("-"):
            raise RegularExpressionException(
                "Character range operator '-' requires a preceding character as in 'a-z'.")
        elif char_code is None:
            raise RegularExpressionException(
                "Missing closing ']' in character range expression.")
        elif char_code == ord("]"):
            break
        elif char_code == ord("\\"):
            char_code = snap_backslashed_character.do(sh)

        if not __check_letter(sh, "-"):
            # (*) Normal character
            tracker.consider_letter(char_code)
        else:
            # (*) Character range:  'character0' '-' 'character1'
            char_code_2 = utf8.__read_one_utf8_code_from_stream(sh)
            quote_checker.do(char_code_2)
            if char_code_2 in [None, ord(']')]:
                raise RegularExpressionException(
                    "Character range: '-' requires a character following '-'.")
            elif char_code_2 == ord("-"):
                # The character BEHIND the range operator is to be checked.
                # (The first character cannot be an unescaped '-' here.)
                raise RegularExpressionException(
                    "Character range operator '-' followed by '-'.")
            elif char_code_2 == ord("\\"):
                char_code_2 = snap_backslashed_character.do(sh)

            if char_code == char_code_2:
                utf8_string = utf8.map_unicode_to_utf8(char_code)
                raise RegularExpressionException("Character range '%s-%s' has only one element.\n" \
                                                 % (utf8_string, utf8_string) + \
                                                 "In this case avoid range expression for clarity.")

            # value denotes 'end', i.e first character outside the interval => add 1
            tracker.consider_interval(char_code, char_code_2 + 1)

        if char_code is None:
            break

    if tracker.negation_f:
        return tracker.match_set.get_complement(Setup.buffer_encoding.source_set)
    else:
        return tracker.match_set