def argv_is_query_option(Cl, Option, Name, PrevQueryF):
    """Determines whether the setup parameter is a parameter related to
    queries (or to code generation). If mixed usage is detected, an error
    is issued.

    RETURN: query flag

    The query flag is the same as QueryF, except for one case: when QueryF
    was None (unset) and the option appeared on the command line. Then, the
    return value tells whether the option was a query flag or not.

    ERROR: If there are mixed options, i.e. query flags and code generation
    flags appear at the same time.
    """
    query_f = (Name.find("query_") == 0)

    if   PrevQueryF is None:    return query_f
    elif PrevQueryF == query_f: return query_f

    # If the debug exception is enabled, do not trigger an error.
    if Cl.search(SETUP_INFO["_debug_exception_f"][0]):
        return query_f

    error_msg("Mixed options: query and code generation mode.\n"
              "The option(s) '%s' cannot be combined with preceding options." \
              % str(SETUP_INFO[Name][0])[1:-1].replace("'", ""))
def parse(fh): """This function parses a mode description and enters it into the 'blackboard.mode_description_db'. Once all modes are parsed they can be translated into 'real' modes and are located in 'blackboard.mode_db'. """ # NOTE: Catching of EOF happens in caller: parse_section(...) skip_whitespace(fh) mode_name = read_identifier(fh, OnMissingStr="Missing identifier at beginning of mode definition.") # NOTE: constructor does register this mode in the mode_db new_mode = ModeDescription(mode_name, SourceRef.from_FileHandle(fh)) # (*) inherited modes / option_db skip_whitespace(fh) dummy = fh.read(1) if dummy not in [":", "{"]: error_msg("missing ':' or '{' after mode '%s'" % mode_name, fh) if dummy == ":": __parse_option_list(new_mode, fh) # (*) read in pattern-action pairs and events while __parse_element(new_mode, fh): pass
def utf8_to_unicode(ByteSequence):
    """Unfortunately, there is no elegant way to do the utf8-decoding
    safely in libPython: due to the strange behavior of a python narrow
    build, a character >= 0x10000 may appear as a 2 byte string and
    cannot be handled by 'ord' in python 2.x. Thus:

        utf8d = codecs.getdecoder("utf-8")
        return ord(utf8d("".join(map(chr, ByteSequence)))[0])

    would be unsafe. That's why we do it by hand here.
    """
    # Assume that the byte sequence is valid. The first byte of a sequence of
    # length 'N' carries N leading ones followed by a zero (for N >= 2; a
    # single ASCII byte starts with a zero). Masking with the lowest '8 - N'
    # bits is safe, because the one extra bit that the mask covers is the
    # guaranteed-zero separator. All following bytes start with the bits '10'
    # and contain 6 bits of useful payload.
    header_bit_n = 8 - len(ByteSequence)
    mask         = (1 << header_bit_n) - 1
    value        = ByteSequence[0] & mask
    for byte in ByteSequence[1:]:
        value <<= 6
        value |=  (byte & 0x3F)   # blend off the highest two bits
        # The highest two bits of a follow byte in utf8 MUST be '10'. Thus:
        if (byte & 0xC0) != 0x80:
            error_msg("Error in UTF8 encoded file. Inadmissible byte sequence detected. Found byte '%02X'" % byte)

    return value
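# -- Illustration (not quex code) ---------------------------------------------
# A minimal standalone sketch of the decoding arithmetic above, assuming a
# valid byte sequence: decode the UTF-8 bytes of the Euro sign and cross-check
# the result against Python 2's own decoder. The variable names are illustrative.
byte_sequence = [0xE2, 0x82, 0xAC]        # UTF-8 for U+20AC (Euro sign)

header_bit_n  = 8 - len(byte_sequence)    # mask keeps 4 payload bits + the zero separator
mask          = (1 << header_bit_n) - 1
value         = byte_sequence[0] & mask
for byte in byte_sequence[1:]:
    assert (byte & 0xC0) == 0x80          # follow bytes must start with '10'
    value = (value << 6) | (byte & 0x3F)

assert value == 0x20AC
assert value == ord("".join(map(chr, byte_sequence)).decode("utf-8"))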
def __start_mode(applicable_mode_name_list, mode_name_list):
    """If more than one mode is defined, then that requires an explicit
    definition 'start = mode'.
    """
    assert len(applicable_mode_name_list) != 0

    start_mode = blackboard.initial_mode.get_pure_code()
    if start_mode == "":
        # Choose an applicable mode as start mode
        start_mode              = applicable_mode_name_list[0]
        blackboard.initial_mode = CodeFragment(start_mode)
        if len(applicable_mode_name_list) > 1:
            error_msg("No initial mode defined via 'start' while more than one applicable mode exists.\n" + \
                      "Use for example 'start = %s;' in the quex source file to define an initial mode." \
                      % start_mode)
        # This branch: start mode is applicable and present
    else:
        FileName = blackboard.initial_mode.filename
        LineN    = blackboard.initial_mode.line_n
        # Start mode present and applicable?
        verify_word_in_list(start_mode, mode_name_list,
                            "Start mode '%s' is not defined." % start_mode,
                            FileName, LineN)
        verify_word_in_list(start_mode, applicable_mode_name_list,
                            "Start mode '%s' is inheritable only and cannot be instantiated." % start_mode,
                            FileName, LineN)
def parse(fh, CodeFragmentName,
          ErrorOnFailureF=True, AllowBriefTokenSenderF=True, ContinueF=True):
    """RETURNS: An object of class UserCodeFragment containing line number,
                filename, and the code fragment.

                None in case of failure.
    """
    assert Setup.__class__ == QuexSetup
    assert type(ErrorOnFailureF)        == bool
    assert type(AllowBriefTokenSenderF) == bool

    skip_whitespace(fh)
    word = fh.read(2)
    if len(word) >= 1 and word[0] == "{":
        if len(word) > 1:
            fh.seek(-1, 1)   # unput the second character
        return __parse_normal(fh, CodeFragmentName)

    elif AllowBriefTokenSenderF and word == "=>":
        return __parse_brief_token_sender(fh, ContinueF)

    elif not ErrorOnFailureF:
        fh.seek(-2, 1)
        return None

    else:
        error_msg("Missing code fragment after %s definition." % CodeFragmentName, fh)
def __parse_brief_token_sender(fh, ContinueF):
    # shorthand for { self.send(TKN_SOMETHING); QUEX_SETTING_AFTER_SEND_CONTINUE_OR_RETURN(); }
    LanguageDB = Setup.language_db

    position = fh.tell()
    line_n   = get_current_line_info_number(fh) + 1
    try:
        skip_whitespace(fh)
        position = fh.tell()

        code = __parse_token_id_specification_by_character_code(fh)
        if code != -1:
            code = __create_token_sender_by_character_code(fh, code)
        else:
            skip_whitespace(fh)
            identifier = __read_token_identifier(fh)
            skip_whitespace(fh)
            if identifier in ["GOTO", "GOSUB", "GOUP"]:
                code = __create_mode_transition_and_token_sender(fh, identifier)
            else:
                code = __create_token_sender_by_token_name(fh, identifier)
                check_or_die(fh, ";")

        if code != "":
            if ContinueF:
                code += "QUEX_SETTING_AFTER_SEND_CONTINUE_OR_RETURN();\n"
            return UserCodeFragment(code, fh.name, line_n, LanguageDB)
        else:
            return None

    except EndOfStreamException:
        fh.seek(position)
        error_msg("End of file reached while parsing token shortcut.", fh)
def __parse(fh, result, IndentationSetupF=False):
    """Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    In other words, the right hand side *must* be a character set.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    while 1 + 1 == 2:
        skip_whitespace(fh)
        if check(fh, ">"):
            break

        # A regular expression state machine
        pattern, identifier, sr = __parse_definition_head(fh, result)
        if pattern is None and IndentationSetupF:
            error_msg("Keyword '\\else' cannot be used in indentation setup.", fh)

        # '__parse_definition_head()' ensures that only identifiers mentioned in
        # 'result' are accepted.
        if not IndentationSetupF:
            value = read_value_specifier(fh, identifier, 1)
            result.specify(identifier, pattern, value, sr)
        else:
            result.specify(identifier, pattern, sr)

        if not check(fh, ";"):
            error_msg("Missing ';' after '%s' specification." % identifier, fh)

    return result
def parse_section(fh, descriptor, already_defined_list):
    pos = fh.tell()
    try:
        return __parse_section(fh, descriptor, already_defined_list)
    except EndOfStreamException:
        fh.seek(pos)
        error_msg("End of file reached while parsing token_type section.", fh)
def __error_character_set_intersection(Before):
    # 'Name' and 'FH' are expected from the enclosing parser scope.
    error_msg("Character set specification '%s' intersects" % Name,
              FH, DontExitF=True, WarningF=False)
    error_msg("with definition for '%s' at this place." % Before.name,
              Before.file_name, Before.line_n)
def seal(self):
    if len(self.space_db) == 0 and len(self.grid_db) == 0:
        default_space = ord(' ')
        default_tab   = ord('\t')
        bad           = self.bad_character_set
        if bad.get().contains(default_space) == False:
            self.specify_space("[ ]", NumberSet(default_space), 1, self.fh)
        if bad.get().contains(default_tab) == False:
            self.specify_grid("[\\t]", NumberSet(default_tab), 4, self.fh)

        if len(self.space_db) == 0 and len(self.grid_db) == 0:
            error_msg("No space or grid defined for indentation counting. Default\n"
                      "values ' ' and '\\t' could not be used since they are specified as 'bad'.",
                      bad.file_name, bad.line_n)

    if self.newline_state_machine.get() is None:
        sm      = StateMachine()
        end_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\n')), AcceptanceF=True)
        mid_idx = sm.add_transition(sm.init_state_index, NumberSet(ord('\r')), AcceptanceF=False)
        sm.add_transition(mid_idx, NumberSet(ord('\n')), end_idx, AcceptanceF=False)
        self.specify_newline("(\\r\\n)|(\\n)", sm, self.fh)
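# -- Illustration (not quex code) ---------------------------------------------
# The default newline machine built in seal() accepts '\n' as well as '\r\n'.
# A minimal standalone sketch of that three-state automaton, using a plain
# transition dict instead of quex's StateMachine class (names are illustrative):
# state 0 = init, state 1 = after '\r', state 2 = accepting.
transitions = {
    (0, '\n'): 2,   # '\n' alone is accepted
    (0, '\r'): 1,   # '\r' must be followed by ...
    (1, '\n'): 2,   # ... '\n' to reach acceptance
}

def matches_newline(text):
    state = 0
    for ch in text:
        state = transitions.get((state, ch))
        if state is None:
            return False
    return state == 2

assert matches_newline("\n")
assert matches_newline("\r\n")
assert not matches_newline("\r")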
def parse_standard_members(fh, section_name, descriptor, already_defined_list):
    if not check(fh, "{"):
        error_msg("Missing opening '{' at begin of token_type section '%s'." % section_name, fh)

    position = fh.tell()

    while 1 + 1 == 2:
        try:
            result = parse_variable_definition(fh)
        except EndOfStreamException:
            fh.seek(position)
            error_eof("standard", fh)

        if result is None:
            return
        type_code_fragment, name = result[0], result[1]

        __validate_definition(type_code_fragment, name,
                              already_defined_list, StandardMembersF=True)

        if   name == "id":            descriptor.token_id_type      = type_code_fragment
        elif name == "column_number": descriptor.column_number_type = type_code_fragment
        elif name == "line_number":   descriptor.line_number_type   = type_code_fragment
        else:
            assert False  # This should have been caught by the variable parser function

        already_defined_list.append([name, type_code_fragment])
def __delete_forbidden_ranges(sm, fh):
    """Unicode defines all code points >= 0. Thus there can be no code points
    below zero, although they might result from some number set operations.

    NOTE: This operation might result in orphaned states that have to be
          deleted.
    """
    global Setup

    character_value_limit = Setup.get_character_value_limit()
    for state in sm.states.values():
        for target_state_index, trigger_set in state.transitions().get_map().items():
            # Make sure that all transitions lie inside the unicode code range
            if trigger_set.minimum() < UnicodeInterval.begin or trigger_set.supremum() >= UnicodeInterval.end:
                trigger_set.intersect_with(UnicodeInterval)

            if trigger_set.supremum() > character_value_limit:
                error_msg("Pattern contains character beyond the scope of the buffer element size (%s)\n" \
                          % Setup.get_character_value_limit_str() + \
                          "Please, cut the character range of the regular expression,\n"
                          "adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n" + \
                          "or specify '--buffer-element-size-irrelevant' to ignore the issue.", fh)

            if Setup.buffer_codec in ["utf16-le", "utf16-be"]:
                # Delete the forbidden interval: D800-DFFF
                if trigger_set.has_intersection(ForbiddenRange):
                    error_msg("Pattern contains characters in unicode range 0xD800-0xDFFF.\n"
                              "This range is not covered by UTF16. Cutting Interval.", fh, DontExitF=True)
                    trigger_set.cut_interval(ForbiddenRange)

            # If the operation resulted in cutting the path to the target state, then delete it.
            if trigger_set.is_empty():
                state.transitions().delete_transitions_to_target(target_state_index)
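# -- Illustration (not quex code) ---------------------------------------------
# A standalone sketch of cutting the UTF-16 surrogate range [0xD800, 0xE000)
# out of a half-open interval, as done above for 'ForbiddenRange'. This is not
# quex's NumberSet API; the helper below is purely illustrative.
FORBIDDEN = (0xD800, 0xE000)

def cut_interval(begin, end, forbidden=FORBIDDEN):
    f_begin, f_end = forbidden
    result = []
    if begin < f_begin:
        result.append((begin, min(end, f_begin)))
    if end > f_end:
        result.append((max(begin, f_end), end))
    return result

# A trigger set spanning the surrogates falls apart into two intervals:
assert cut_interval(0xD000, 0xF000) == [(0xD000, 0xD800), (0xE000, 0xF000)]
# A trigger set fully inside the surrogates becomes empty -> transition deleted.
assert cut_interval(0xD900, 0xDA00) == []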
def __check_file_name(setup, Candidate, Name):
    value             = setup.__dict__[Candidate]
    CommandLineOption = command_line_args(Candidate)

    if value == "":
        return

    if type(value) == list:
        for name in value:
            if name != "" and name[0] == "-":
                error_msg("Quex refuses to work with file names that start with '-' (minus).\n" + \
                          "Received '%s' for %s (%s)" % (value, name, repr(CommandLineOption)[1:-1]))
            if os.access(name, os.F_OK) == False:
                # error_msg("File %s (%s)\ncannot be found." % (name, Name))
                error_msg_file_not_found(name, Name)
    else:
        if value == "" or value[0] == "-":
            return
        if os.access(value, os.F_OK):
            return
        if os.access(QUEX_PATH + "/" + value, os.F_OK):
            return
        if     os.access(os.path.dirname(value), os.F_OK) == False \
           and os.access(QUEX_PATH + "/" + os.path.dirname(value), os.F_OK) == False:
            error_msg("File '%s' is supposed to be located in directory '%s' or\n" % \
                      (os.path.basename(value), os.path.dirname(value)) + \
                      "'%s'. No such directories exist." % \
                      (QUEX_PATH + "/" + os.path.dirname(value)))
        error_msg_file_not_found(value, Name)
def __handle_property_match(cl):
    property_follower = Setup.query_property_match
    if not property_follower:
        return

    sys.stderr.write("(please, wait for database parsing to complete)\n")

    fields = map(lambda x: x.strip(), property_follower.split("="))
    if len(fields) != 2:
        error_msg("Wrong property setting '%s'." % property_follower)

    # -- determine name and value
    name                 = fields[0]
    wild_card_expression = fields[1]

    # -- get the property from the database
    property = __get_property(name)
    if property is None:
        return True

    # -- find the character set for the given expression
    if property.type == "Binary":
        error_msg("Binary property '%s' is not subject to value wild card matching.\n" % property.name)

    for value in property.get_wildcard_value_matches(wild_card_expression):
        print value
def __general_validate(fh, Mode, Name, pos):
    if Name == "on_indentation":
        fh.seek(pos)
        error_msg("Definition of 'on_indentation' is no longer supported since version 0.51.1.\n"
                  "Please, use 'on_indent' for the event of an opening indentation, 'on_dedent'\n"
                  "for closing indentation, and 'on_nodent' for no change in indentation.", fh)

    def error_dedent_and_ndedent(code, A, B):
        filename = "(unknown)"
        line_n   = "0"
        if hasattr(code, "filename"): filename = code.filename
        if hasattr(code, "line_n"):   line_n   = code.line_n
        error_msg("Indentation event handler '%s' cannot be defined, because\n" % A,
                  fh, DontExitF=True, WarningF=False)
        error_msg("the alternative '%s' has already been defined." % B,
                  filename, line_n)

    if Name == "on_dedent" and Mode.events.has_key("on_n_dedent"):
        fh.seek(pos)
        code = Mode.events["on_n_dedent"]
        if code.get_code() != "":
            error_dedent_and_ndedent(code, "on_dedent", "on_n_dedent")

    if Name == "on_n_dedent" and Mode.events.has_key("on_dedent"):
        fh.seek(pos)
        code = Mode.events["on_dedent"]
        if code.get_code() != "":
            error_dedent_and_ndedent(code, "on_n_dedent", "on_dedent")
def get_character_value_limit(self):
    """A buffer element is a chunk of memory of the size of the granularity
    by which the input pointer increases. For fixed size codecs, such as
    ASCII or UCS32, the BUFFER ELEMENT VALUE LIMIT is exactly the same as
    the CHARACTER VALUE LIMIT.

    However, for dynamic sized codecs, such as UTF8 or UTF16, they are
    different. In UTF8, the input pointer increments by one byte on each
    state transition. However, a character may consist of multiple bytes.
    The buffer element value limit is 256, but the character value limit
    is the whole range.

    RETURNS: Integer = supremum of the possible character range, i.e. one
                       character behind the last possible.

             sys.maxint, if no such limit exists.
    """
    buffer_element_size = self.buffer_element_size

    if buffer_element_size == -1:
        return sys.maxint

    try:
        result = 256 ** buffer_element_size
    except:
        file_in.error_msg("Error while trying to compute 256 to the 'buffer-element-size' (%i bytes)\n" \
                          % buffer_element_size + \
                          "Adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n" + \
                          "or specify '--buffer-element-size-irrelevant' to ignore the issue.")

    if result > sys.maxint:
        return sys.maxint
    else:
        return result
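# -- Illustration (not quex code) ---------------------------------------------
# The limit arithmetic above, worked through with assumed sizes: a buffer
# element of N bytes can hold the values 0 .. 256**N - 1, so the supremum
# ("one behind the last possible") is 256**N.
for size, expected in [(1, 0x100), (2, 0x10000), (4, 0x100000000)]:
    assert 256 ** size == expected

# E.g. with a 2-byte buffer element, a pattern character 0x10400 lies beyond
# the supremum 0x10000 and would trigger the error in __delete_forbidden_ranges().
assert 0x10400 >= 256 ** 2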
def buffer_codec_prepare(self, BufferCodecName, BufferCodecFileName=None, Module=None):
    """Determines: Setup.buffer_codec_name
                   Setup.buffer_codec
    """
    if BufferCodecName in ("utf8", "utf16"):
        assert Module is not None
        result = codec_db.CodecDynamicInfo(BufferCodecName, Module)
    elif BufferCodecFileName:
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except:
            file_in.error_msg("cannot interpret string following '--codec-file'")
        result = codec_db.CodecTransformationInfo(FileName=BufferCodecFileName)
    elif BufferCodecName == "unicode":
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        result = codec_db.CodecInfo("unicode",
                                    NumberSet.from_range(0, self.get_character_value_limit()),
                                    NumberSet.from_range(0, self.get_character_value_limit()))
    elif BufferCodecName == "unit-test":
        result = codec_db.CodecInfo("unicode",
                                    NumberSet.from_range(-sys.maxint, sys.maxint),
                                    NumberSet.from_range(-sys.maxint, sys.maxint))
    else:
        result = codec_db.CodecTransformationInfo(BufferCodecName)

    self.buffer_codec = result
def __parse_event(new_mode, fh, word):
    pos = fh.tell()

    # Allow '<<EOF>>' and '<<FAIL>>' out of respect for classical tools like 'lex'
    if   word == "<<EOF>>":  word = "on_end_of_stream"
    elif word == "<<FAIL>>": word = "on_failure"
    elif word in blackboard.all_section_title_list:
        error_msg("Pattern '%s' is a quex section title. Has the closing '}' of mode %s \n" % (word, new_mode.name) \
                  + "been forgotten? Else use quotes, i.e. \"%s\"." % word, fh)
    elif len(word) < 3 or word[:3] != "on_":
        return False

    comment = "Unknown event handler '%s'.\n" % word + \
              "Note that any pattern starting with 'on_' is considered an event handler.\n" + \
              "Use double quotes to bracket patterns that start with 'on_'."

    __general_validate(fh, new_mode, word, pos)
    verify_word_in_list(word, event_handler_db.keys(), comment, fh)
    __validate_required_token_policy_queue(word, fh, pos)

    continue_f = True
    if word == "on_end_of_stream":
        # When a termination token is sent, no other token shall follow.
        # => Enforce return from the analyzer! Do not allow CONTINUE!
        continue_f = False

    new_mode.events[word] = code_fragment.parse(fh, "%s::%s event handler" % (new_mode.name, word),
                                                ContinueF=continue_f)
    return True
def __warn_on_double_definition():
    """Double check that no token id appears twice. Again, this can only
    happen if quex itself produced the numeric values for the tokens. If
    the token ids come from outside, Quex does not know the numeric value.
    It cannot warn about double definitions.
    """
    assert len(Setup.token_id_foreign_definition_file) == 0

    clash_db = defaultdict(list)

    token_list = token_id_db.values()
    for i, x in enumerate(token_list):
        for y in token_list[i+1:]:
            if x.number != y.number: continue
            clash_db[x.number].append(x)
            clash_db[x.number].append(y)

    def find_source_reference(TokenList):
        for token in TokenList:
            if token.sr.is_void(): continue
            return token.sr
        return None

    if len(clash_db) != 0:
        item_list = clash_db.items()
        item_list.sort()
        sr = find_source_reference(item_list[0][1])
        error_msg("Following token ids have the same numeric value assigned:", sr,
                  DontExitF=True)
        for x, token_id_list in item_list:
            sr               = find_source_reference(token_id_list)
            token_ids_sorted = sorted(list(set(token_id_list)), key=attrgetter("name"))  # Ensure uniqueness
            error_msg("  %s: %s" % (x, "".join(["%s, " % t.name for t in token_ids_sorted])),
                      sr, DontExitF=True)
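# -- Illustration (not quex code) ---------------------------------------------
# A standalone sketch of the pairwise clash detection above, with plain tuples
# standing in for quex's TokenInfo objects. The token names are illustrative.
from collections import defaultdict

tokens = [("TKN_A", 10), ("TKN_B", 11), ("TKN_C", 10)]   # (name, numeric id)

clash_db = defaultdict(set)
for i, (x_name, x_number) in enumerate(tokens):
    for y_name, y_number in tokens[i+1:]:
        if x_number != y_number: continue
        clash_db[x_number].update([x_name, y_name])

assert dict(clash_db) == {10: set(["TKN_A", "TKN_C"])}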
def detect_path_of_nothing_is_necessary(sm, Name, PostContextPresentF, fh):
    assert Name in ["", "pre-context", "post-context"]
    if sm is None:
        return
    elif not sm.get_init_state().is_acceptance():
        return
    if len(Name) == 0: name_str = "core pattern"
    else:              name_str = Name

    msg = "The %s contains a 'nothing is necessary' path in the state machine.\n"  \
          % name_str                                                             + \
          "This means that without reading a character the analyzer drops into\n" + \
          "an acceptance state. "

    msg += {
        "":             "The analyzer would then stall.",

        "pre-context":  "E.g., pattern 'x*/y/' means that zero or more 'x' are a pre-\n"             + \
                        "condition for 'y'. If zero appearances of 'x' are enough, then obviously\n" + \
                        "there is no pre-context for 'y'! Most likely the author intended 'x+/y/'.",

        "post-context": "A post context where nothing is necessary is superfluous.",
    }[Name]

    if Name != "post-context" and PostContextPresentF:
        msg += "\n" \
               "Note: A post context does not change anything about that fact."

    error_msg(msg, fh)
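# -- Illustration (not quex code) ---------------------------------------------
# The 'nothing is necessary' situation, shown with Python's 're' module: a
# pattern like 'x*' accepts the empty string, i.e. its automaton's initial
# state is already an acceptance state -- exactly what is reported above.
import re
assert re.match(r"x*", "") is not None   # zero 'x' suffice: nothing is necessary
assert re.match(r"x+", "") is None       # 'x+' requires at least one 'x'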
def parse(fh, CodeFragmentName,
          ErrorOnFailureF=True, AllowBriefTokenSenderF=True, ContinueF=True):
    """RETURNS: An object of class CodeUser containing line number, filename,
                and the code fragment.

                None in case of failure.
    """
    assert type(ErrorOnFailureF)        == bool
    assert type(AllowBriefTokenSenderF) == bool

    skip_whitespace(fh)
    word = fh.read(2)
    if len(word) >= 1 and word[0] == "{":
        if len(word) > 1:
            fh.seek(-1, 1)   # unput the second character
        return __parse_normal(fh, CodeFragmentName)

    elif AllowBriefTokenSenderF and word == "=>":
        return __parse_brief_token_sender(fh, ContinueF)

    elif not ErrorOnFailureF:
        fh.seek(-2, 1)
        return None

    else:
        error_msg("Missing code fragment after %s definition." % CodeFragmentName, fh)
def __parse_base_mode_list(fh, new_mode):
    new_mode.base_modes = []
    trailing_comma_f    = False
    while 1 + 1 == 2:
        if   check(fh, "{"): fh.seek(-1, 1); break
        elif check(fh, "<"): fh.seek(-1, 1); break

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "": break

        new_mode.base_modes.append(identifier)
        trailing_comma_f = False
        if not check(fh, ","): break
        trailing_comma_f = True

    if trailing_comma_f:
        error_msg("Trailing ',' after base mode '%s'." % new_mode.base_modes[-1],
                  fh, DontExitF=True, WarningF=True)

    elif len(new_mode.base_modes) != 0:
        # This check is a 'service' -- for those who follow the old convention
        pos = fh.tell()
        skip_whitespace(fh)
        dummy_identifier = read_identifier(fh)
        if dummy_identifier != "":
            error_msg("Missing separating ',' between base modes '%s' and '%s'.\n" \
                      % (new_mode.base_modes[-1], dummy_identifier) + \
                      "(The comma separator is mandatory since quex 0.53.1)", fh)
        fh.seek(pos)
def __post_process(fh, StartPosition, object, ReturnRE_StringF):
    assert    object is None                   \
           or isinstance(object, Pattern)      \
           or isinstance(object, StateMachine) \
           or isinstance(object, NumberSet)

    if isinstance(fh, StringIO):
        regular_expression = ""
    else:
        end_position = fh.tell()
        fh.seek(StartPosition)
        regular_expression = fh.read(end_position - StartPosition)
        if regular_expression == "":
            regular_expression = fh.read(1)
            fh.seek(-1, 1)

    # (*) error in regular expression?
    if object is None:
        error_msg("No valid regular expression detected, found '%s'." % regular_expression, fh)

    # NOT: Do not transform here, since transformation might happen twice when
    #      patterns are defined and when they are replaced.
    if ReturnRE_StringF: return regular_expression, object
    else:                return object
def parse(fh): """This function parses a mode description and enters it into the 'blackboard.mode_description_db'. Once all modes are parsed they can be translated into 'real' modes and are located in 'blackboard.mode_db'. """ # NOTE: Catching of EOF happens in caller: parse_section(...) skip_whitespace(fh) mode_name = read_identifier( fh, OnMissingStr="Missing identifier at beginning of mode definition.") # NOTE: constructor does register this mode in the mode_db new_mode = ModeDescription(mode_name, SourceRef.from_FileHandle(fh)) # (*) inherited modes / option_db skip_whitespace(fh) dummy = fh.read(1) if dummy not in [":", "{"]: error_msg("missing ':' or '{' after mode '%s'" % mode_name, fh) if dummy == ":": __parse_option_list(new_mode, fh) # (*) read in pattern-action pairs and events while __parse_element(new_mode, fh): pass
def unicode_to_utf8(UnicodeValue):
    if UnicodeValue < 0x80:
        return [ UnicodeValue, ]
    elif UnicodeValue < 0x800:
        # Bits: 5 + 6
        return [ 0xC0 | ((UnicodeValue >> 6)  & 0x1F),
                 0x80 | ((UnicodeValue      ) & 0x3F)]
    elif UnicodeValue < 0x10000:
        # Bits: 4 + 6 + 6
        return [ 0xE0 | ((UnicodeValue >> 12) & 0x0F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ((UnicodeValue      ) & 0x3F)]
    elif UnicodeValue < 0x00200000:
        # Bits: 3 + 6 + 6 + 6
        return [ 0xF0 | ((UnicodeValue >> 18) & 0x07),
                 0x80 | ((UnicodeValue >> 12) & 0x3F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ((UnicodeValue      ) & 0x3F)]
    elif UnicodeValue < 0x04000000L:
        # Bits: 2 + 6 + 6 + 6 + 6  (lead byte '111110xx' = 0xF8, not 0xF0)
        return [ 0xF8 | ((UnicodeValue >> 24) & 0x03),
                 0x80 | ((UnicodeValue >> 18) & 0x3F),
                 0x80 | ((UnicodeValue >> 12) & 0x3F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ((UnicodeValue      ) & 0x3F)]
    elif UnicodeValue < 0x80000000L:
        # Bits: 1 + 6 + 6 + 6 + 6 + 6  (lead byte '1111110x' = 0xFC)
        return [ 0xFC | ((UnicodeValue >> 30) & 0x01),
                 0x80 | ((UnicodeValue >> 24) & 0x3F),
                 0x80 | ((UnicodeValue >> 18) & 0x3F),
                 0x80 | ((UnicodeValue >> 12) & 0x3F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ((UnicodeValue      ) & 0x3F)]
    else:
        error_msg("Unicode character 0x%8X > 0x7FFFFFFF detected. Cannot be handled." % UnicodeValue)
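# -- Illustration (not quex code) ---------------------------------------------
# A standalone round-trip check of the bit arithmetic above for a few BMP code
# points (illustrative values), cross-checked against Python 2's own encoder.
for code_point in [0x24, 0x5D0, 0x20AC, 0xFFFD]:
    byte_sequence = unicode_to_utf8(code_point)
    expected      = map(ord, unichr(code_point).encode("utf-8"))
    assert byte_sequence == expected, hex(code_point)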
def __handle_property_match(cl):
    property_follower = cl.follow("", "--property-match")
    sys.stderr.write("(please, wait for database parsing to complete)\n")

    if property_follower == "":
        return

    fields = map(lambda x: x.strip(), property_follower.split("="))
    if len(fields) != 2:
        error_msg("Wrong property setting '%s'." % property_follower)

    # -- determine name and value
    name                 = fields[0]
    wild_card_expression = fields[1]

    # -- get the property from the database
    property = __get_property(name)
    if property is None:
        return True

    # -- find the character set for the given expression
    if property.type == "Binary":
        error_msg("Binary property '%s' is not subject to value wild card matching.\n" % property.name)

    for value in property.get_wildcard_value_matches(wild_card_expression):
        print value
def token_id_db_verify_or_enter_token_id(fh, TokenName):
    global Setup

    prefix_less_TokenName = cut_token_id_prefix(TokenName, fh)

    # Occasionally add the token id automatically to the database
    if not blackboard.token_id_db.has_key(prefix_less_TokenName):
        # DO NOT ENFORCE THE TOKEN ID TO BE DEFINED, BECAUSE WHEN THE TOKEN ID
        # IS DEFINED IN C-CODE, THE IDENTIFICATION IS NOT 100% SAFE.
        if TokenName in blackboard.token_id_db.keys():
            msg  = "Token id '%s' defined implicitly.\n" % TokenName
            msg += "'%s' has been defined in a token { ... } section!\n" % \
                   (Setup.token_id_prefix + TokenName)
            msg += "Token ids in the token { ... } section are automatically prefixed."
            error_msg(msg, fh, DontExitF=True,
                      SuppressCode=NotificationDB.warning_usage_of_undefined_token_id_name)
        else:
            # Warning is posted later when all implicit tokens have been
            # collected. See "token_id_maker.__propose_implicit_token_definitions()"
            blackboard.token_id_implicit_list.append((prefix_less_TokenName,
                                                      SourceRef.from_FileHandle(fh)))

        # Enter the implicit token id definition in the database
        blackboard.token_id_db[prefix_less_TokenName] = \
                TokenInfo(prefix_less_TokenName, None, None, SourceRef.from_FileHandle(fh))
def __check_on_orphan_states(Place, sm):
    orphan_state_list = sm.get_orphaned_state_index_list()
    if len(orphan_state_list) == 0:
        return
    error_msg("After '%s'" % Place + "\n" + \
              "Orphaned state(s) detected in regular expression (lack of optimization).\n" + \
              "Please, log a defect at the project's website quex.sourceforge.net.\n"      + \
              "Orphan state(s) = " + repr(orphan_state_list))
def __sm_newline_default(self):
    """Default newline: '(\n)|(\r\n)'
    """
    global cc_type_name_db

    newline_set = NumberSet(ord('\n'))
    retour_set  = NumberSet(ord('\r'))

    before = self.count_command_map.find_occupier(newline_set, set())
    if before is not None:
        error_msg("Trying to implement default newline: '\\n' or '\\r\\n'.\n"
                  "The '\\n' option is not possible, since it has been occupied by '%s'.\n" \
                  "No newline can be defined by default." % cc_type_name_db[before.cc_type],
                  before.sr, DontExitF=True,
                  SuppressCode=NotificationDB.warning_default_newline_0A_impossible)
        # In this case, no newline can be defined!
        return

    sm = StateMachine.from_character_set(newline_set)

    if Setup.dos_carriage_return_newline_f:
        before = self.count_command_map.find_occupier(retour_set, set())
        if before is not None:
            error_msg("Trying to implement default newline: '\\n' or '\\r\\n'.\n"
                      "The '\\r\\n' option is not possible, since '\\r' has been occupied by '%s'." \
                      % cc_type_name_db[before.cc_type],
                      before.sr, DontExitF=True,
                      SuppressCode=NotificationDB.warning_default_newline_0D_impossible)
        else:
            sm.add_transition_sequence(sm.init_state_index, [retour_set, newline_set])

    return sm
def check_homogenous_space_counts(self):
    common = None
    for character_set, info in self.__map:
        if info.cc_type != E_CharacterCountType.COLUMN:
            if info.cc_type == E_CharacterCountType.GRID:
                return
            continue
        elif type(info.value) in (str, unicode):
            # If there is one single 'variable' grid value,
            # then no assumptions can be made.
            return
        elif common is None:
            common = info
        elif common.value != info.value:
            # space counts are not homogeneous
            return

    if common is None:
        return

    error_msg("Setup does not contain a grid but only homogeneous space counts of %i.\n" \
              % common.value + \
              "This setup is equivalent to a setup with space counts of 1. Space counts\n" + \
              "of 1 are the fastest to compute.",
              common.sr, DontExitF=True)
def check_grid_specification(self, Value, sr):
    if Value == 0:
        error_msg("A grid count of 0 is nonsense. Maybe define a space count of 0.", sr)
    elif Value == 1:
        error_msg("Indentation grid counts of '1' are equivalent to a space\n" + \
                  "count of '1'. The latter is faster to compute.",
                  sr, DontExitF=True)
def do(ARGV):
    """Performs a query based on the given command line arguments.

    RETURNS: True  if a query was performed.
             False if no query was requested.
    """
    cl        = GetPot(ARGV, SectionsEnabledF=False)
    success_f = False

    # Regular expressions extract the BufferLimitCode and the PathTerminatorCode
    # from the sets. So let us define them outside the normal range.
    backup_buffer_limit_code = Setup.buffer_limit_code
    backup_path_limit_code   = Setup.path_limit_code
    Setup.buffer_limit_code  = -1
    Setup.path_limit_code    = -1

    try:
        success_f = True
        if   search_and_validate(cl, "--codec-info"):         __handle_codec(cl)
        elif search_and_validate(cl, "--codec-file-info"):    __handle_codec_file(cl)
        elif search_and_validate(cl, "--codec-for-language"): __handle_codec_for_language(cl)
        elif search_and_validate(cl, "--property"):           __handle_property(cl)
        elif search_and_validate(cl, "--set-by-property"):    __handle_set_by_property(cl)
        elif search_and_validate(cl, "--set-by-expression"):  __handle_set_by_expression(cl)
        elif search_and_validate(cl, "--property-match"):     __handle_property_match(cl)
        else:                                                 success_f = False

    except RegularExpressionException, x:
        error_msg(x.message)
def check_grid_values_integer_multiples(self):
    """If there are no spaces and the grid is on a homogeneous scale,
    => then the grid can be transformed into 'easy-to-compute' spaces.
    """
    grid_value_list = []
    min_info        = None
    for character_set, info in self.__map:
        if info.cc_type != E_CharacterCountType.GRID:
            if info.cc_type == E_CharacterCountType.COLUMN:
                return
            continue
        elif type(info.value) in (str, unicode):
            # If there is one single 'variable' grid value,
            # then no assumptions can be made.
            return
        grid_value_list.append(info.value)
        if min_info is None or info.value < min_info.value:
            min_info = info

    if min_info is None:
        return

    # Are all grid values a multiple of the minimum?
    if len(filter(lambda x: x % min_info.value == 0, grid_value_list)) != len(grid_value_list):
        return

    error_msg("Setup does not contain spaces, only grids (tabulators). All grid\n" \
              "widths are multiples of %i. The grid setup %s\n"                    \
              % (min_info.value, repr(sorted(grid_value_list))[1:-1]) + \
              "is equivalent to a setup with space counts %s.\n"                   \
              % repr(map(lambda x: x / min_info.value, sorted(grid_value_list)))[1:-1] + \
              "Space counts are faster to compute.",
              min_info.sr, DontExitF=True)
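# -- Illustration (not quex code) ---------------------------------------------
# The equivalence checked above, worked through with assumed grid widths:
# [4, 8, 12] are all multiples of the minimum 4, so the setup behaves like
# space counts [1, 2, 3] on a scale of 4.
grid_value_list = [4, 8, 12]
minimum         = min(grid_value_list)

assert all(x % minimum == 0 for x in grid_value_list)
assert [x / minimum for x in grid_value_list] == [1, 2, 3]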
def __check_file_name(setup, Candidate, Name, Index=None, CommandLineOption=None):
    value = setup.__dict__[Candidate]
    if len(value) == 0: return

    if CommandLineOption is None:
        CommandLineOption = command_line_args(Candidate)

    if Index is not None:
        if type(value) != list or len(value) <= Index: value = ""
        else:                                          value = value[Index]

    if type(value) == list:
        for name in value:
            if name != "" and name[0] == "-":
                error_msg("Quex refuses to work with file names that start with '-' (minus).\n" + \
                          "Received '%s' for %s (%s)" % (value, name, repr(CommandLineOption)[1:-1]))
            if os.access(name, os.F_OK) == False:
                # error_msg("File %s (%s)\ncannot be found." % (name, Name))
                error_msg_file_not_found(name, Name)
    else:
        if value == "" or value[0] == "-":              return
        if os.access(value, os.F_OK):                   return
        if os.access(QUEX_PATH + "/" + value, os.F_OK): return
        if     os.access(os.path.dirname(value), os.F_OK) == False \
           and os.access(QUEX_PATH + "/" + os.path.dirname(value), os.F_OK) == False:
            error_msg("File '%s' is supposed to be located in directory '%s' or\n" % \
                      (os.path.basename(value), os.path.dirname(value)) + \
                      "'%s'. No such directories exist." % \
                      (QUEX_PATH + "/" + os.path.dirname(value)))
        error_msg_file_not_found(value, Name)
def __get_float(MemberName):
    ValueStr = setup.__dict__[MemberName]
    if type(ValueStr) == float: return ValueStr
    try:
        return float(ValueStr)
    except:
        option_name = repr(SETUP_INFO[MemberName][0])[1:-1]
        error_msg("Cannot convert '%s' into a floating point number for '%s'" % (ValueStr, option_name))
def open_data_base_file(Filename):
    try:
        fh = open(unicode_db_directory + "/" + Filename, "rb")
    except:
        error_msg("Fatal---Unicode Database File '%s' not found!\n" % Filename + \
                  "QUEX_PATH='%s'\n" % QUEX_PATH + \
                  "Unicode Database Directory: '%s'" % unicode_db_directory)
    return fh
def consistency_check(self):
    self.count_command_map.check_defined(self.sr, E_CharacterCountType.WHITESPACE)
    self.count_command_map.check_defined(self.sr, E_CharacterCountType.BEGIN_NEWLINE)

    if self.sm_newline_suppressor.get() is not None:
        if self.sm_newline.get() is None:
            error_msg("A newline 'suppressor' has been defined.\n"
                      "But there is no 'newline' in the indentation definition.",
                      self.sm_newline_suppressor.sr)
def set(self, Value, fh):
    if self.__value is not None:
        error_msg("%s has been defined more than once.\n" % self.name, fh,
                  DontExitF=True)
        error_msg("Previous definition was here.\n", self.file_name, self.line_n)

    self.__value   = Value
    self.file_name = fh.name
    self.line_n    = get_current_line_info_number(fh)
def __check_identifier(setup, Candidate, Name):
    value = setup.__dict__[Candidate]
    if is_identifier(value): return

    CommandLineOption = SETUP_INFO[Candidate][0]

    error_msg("%s must be a valid identifier (%s).\n" % (Name, repr(CommandLineOption)[1:-1]) + \
              "Received: '%s'" % value)
def parse_distinct_members(fh, section_name, descriptor, already_defined_list):
    if not check(fh, "{"):
        error_msg("Missing opening '{' at begin of token_type section '%s'." % section_name, fh)

    result = parse_variable_definition_list(fh, "distinct", already_defined_list)
    if result == {}:
        error_msg("Missing variable definition in token_type 'distinct' section.", fh)

    descriptor.distinct_db = result
def snap_replacement(stream, PatternDict, StateMachineF=True):
    """Snaps a predefined pattern from the input string and returns the
    resulting state machine.
    """
    skip_whitespace(stream)
    pattern_name = read_identifier(stream)
    if pattern_name == "":
        raise RegularExpressionException("Pattern replacement expression misses identifier after '{'.")
    skip_whitespace(stream)

    if not check(stream, "}"):
        raise RegularExpressionException("Pattern replacement expression misses closing '}' after '%s'." \
                                         % pattern_name)

    verify_word_in_list(pattern_name, PatternDict.keys(),
                        "Specifier '%s' not found in any preceding 'define { ... }' section." % pattern_name,
                        stream)

    reference = PatternDict[pattern_name]
    assert reference.__class__.__name__ == "PatternShorthand"

    # The replacement may be a state machine or a number set
    if StateMachineF:
        # Get a cloned version of the state machine
        state_machine = reference.get_state_machine()
        assert isinstance(state_machine, StateMachine)

        # It is essential that state machines defined as patterns do not
        # have origins. Otherwise, the optimization of patterns that
        # contain pattern replacements might get confused and can
        # not find all optimizations.
        assert state_machine.has_origins() == False

        # A state machine that contains pre- or post- conditions cannot be part
        # of a replacement. The addition of new post-contexts would mess up the pattern.
        ## if state_machine.has_pre_or_post_context():
        ##     error_msg("Pre- or post-conditioned pattern was used in replacement.\n" + \
        ##               "Quex's regular expression grammar does not allow this.", stream)

        return state_machine

    else:
        # Get a cloned version of the character set
        character_set = reference.get_character_set()
        if character_set is None:
            error_msg("Replacement in character set expression must be a character set.\n"
                      "Specifier '%s' relates to a pattern state machine." % pattern_name, stream)

        if character_set.is_empty():
            error_msg("Referenced character set '%s' is empty.\nAborted." % pattern_name, stream)

        return character_set