def test(Text):
    global count_n
    count_n += 1
    if Text.find("\n") == -1:
        print "(%i) |%s|\n" % (count_n, Text)
    else:
        print "(%i)\n::\n%s\n::\n" % (count_n, Text)

    sh      = StringIO(Text)
    sh.name = "test_string"

    descr = None
    if "debug" in sys.argv and "%s" % count_n == sys.argv[3]:
        # Try beyond an exception catcher
        descr = counter.LineColumnCount_Prep(sh).parse()
    try:
        descr = counter.LineColumnCount_Prep(sh).parse()
    except EndOfStreamException:
        error.log("End of file reached while parsing 'counter' section.", sh,
                  DontExitF=True)
    except:
        print "Exception!"

    if descr is not None:
        print descr
    print
def read_integer(fh):
    pos = fh.tell()
    base, digit_list = get_number_base(fh)
    if base is None:
        return None

    txt = ""
    while 1 + 1 == 2:
        tmp = fh.read(1)
        if tmp == "":
            break
        elif tmp not in digit_list:
            fh.seek(-1, 1)
            break
        txt += tmp

    # If we drop out on a digit, then let us assume that the user just missed a point
    if tmp.isdigit() or (tmp in list("ABCDEFabcdef")):
        error.log("Digit '%s' cannot be part of an expression of base %s." % (tmp, base),
                  fh)

    txt = txt.replace(".", "")
    if len(txt) == 0:
        if base in [2, 8, 16, "roman", "Napier"]:
            error.log("Missing digits for number of base %s, found '%s'." % (str(base), tmp),
                      fh)
        fh.seek(pos)
        return None

    # Binary, octal, decimal, and hexadecimal numbers
    if base in [2, 8, 10, 16]:
        return int(txt, base)
    elif base == "roman":
        return __roman_number(txt, fh)
    elif base == "Napier":
        return __napier_number(txt, fh)
    else:
        return __binary_number(txt, fh)
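# A tiny illustration (assumption: not part of the quex sources, added for clarity) of the
# final conversion step above: once the digit characters have been collected, Python's
# built-in int() performs the base conversion for the binary, octal, decimal and
# hexadecimal cases.
assert int("1010", 2) == 10
assert int("755", 8)  == 493
assert int("ff", 16)  == 255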
def __roman_number(Text, fh):
    """Source: http://code.activestate.com -- Code Recipes
       Recipe 81611 by Paul Winkler.
    """
    input = Text.upper()
    # map of (numeral, value, maxcount) tuples
    roman_numeral_map = (('M',  1000, None),
                         ('CM', 900,  1),
                         ('D',  500,  1),
                         ('CD', 400,  1),
                         ('C',  100,  3),
                         ('XC', 90,   1),
                         ('L',  50,   1),
                         ('XL', 40,   1),
                         ('X',  10,   3),
                         ('IX', 9,    1),
                         ('V',  5,    1),
                         ('IV', 4,    1),
                         ('I',  1,    3))

    result, index = 0, 0
    for numeral, value, maxcount in roman_numeral_map:
        count = 0
        while input[index: index + len(numeral)] == numeral:
            count += 1  # how many of this numeral we have
            if maxcount is not None and count > maxcount:
                error.log("input 0r%s is not a valid roman numeral." % Text, fh)
            result += value
            index  += len(numeral)

    if index < len(input):  # There are characters unaccounted for.
        error.log("input 0r%s is not a valid roman numeral." % Text, fh)

    return result
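# A minimal, self-contained sketch (assumption: illustration only, not part of the quex
# sources) of the greedy left-to-right scan performed by __roman_number() above, without
# the 'fh'/error.log machinery. It walks the same (numeral, value, maxcount) table and
# accumulates the value while consuming the input from the left.
def roman_to_int_sketch(text):
    table = (('M', 1000, None), ('CM', 900, 1), ('D', 500, 1), ('CD', 400, 1),
             ('C', 100, 3), ('XC', 90, 1), ('L', 50, 1), ('XL', 40, 1),
             ('X', 10, 3), ('IX', 9, 1), ('V', 5, 1), ('IV', 4, 1), ('I', 1, 3))
    text          = text.upper()
    result, index = 0, 0
    for numeral, value, maxcount in table:
        count = 0
        while text[index: index + len(numeral)] == numeral:
            count += 1
            if maxcount is not None and count > maxcount:
                raise ValueError("not a valid roman numeral: %s" % text)
            result += value
            index  += len(numeral)
    if index < len(text):   # characters unaccounted for
        raise ValueError("not a valid roman numeral: %s" % text)
    return result

assert roman_to_int_sketch("MCMXCIV") == 1994
assert roman_to_int_sketch("iii")     == 3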
def __parse_range_skipper_option(fh, identifier, new_mode):
    """A non-nesting skipper can contain a full-fledged regular expression as
    opener, since it only affects the trigger. Not so the nested range
    skipper -- see below.
    """
    # Range state machines only accept 'strings', not state machines
    # Pattern: opener 'white space' closer 'white space' '>'
    skip_whitespace(fh)
    opener_pattern = regular_expression.parse_non_precontexted_pattern(
        fh, identifier, ">", AllowNothingIsFineF=False)
    _assert_pattern_constaints(opener_pattern, "Skip range opener", fh)

    skip_whitespace(fh)
    closer_pattern = regular_expression.parse_non_precontexted_pattern(
        fh, identifier, ">", AllowNothingIsFineF=True)
    _assert_pattern_constaints(closer_pattern, "Skip range closer", fh)

    opener_pattern.set_pattern_string("<%s open>" % identifier)
    closer_pattern.set_pattern_string("<%s close>" % identifier)

    # -- closer
    skip_whitespace(fh)
    if fh.read(1) != ">":
        error.log("missing closing '>' for mode option '%s'" % identifier, fh)

    return SkipRangeData(opener_pattern, closer_pattern)
def argv_is_query_option(Cl, Option, Name, PrevQueryF):
    """Determines whether the setup parameter is a parameter related to
    queries (or to code generation). If mixed usage is detected, an error is
    issued.

    RETURN: query flag

    The query flag is the same as PrevQueryF, except for one case: when
    PrevQueryF was None (unset) and the option appeared on the command line.
    Then, the return value tells whether the option was a query flag or not.

    ERROR: If there are mixed options, i.e. query flags and code generation
    flags appear at the same time.
    """
    query_f = (Name.find("query_") == 0)

    if PrevQueryF is None:
        return query_f
    elif PrevQueryF == query_f:
        return query_f

    # If the debug exception is enabled, do not trigger an error.
    if Cl.search(SETUP_INFO["_debug_exception_f"][0]):
        return query_f

    error.log("Mixed options: query and code generation mode.\n"
              "The option(s) '%s' cannot be combined with preceding options." \
              % str(SETUP_INFO[Name][0])[1:-1].replace("'", ""))
def split_first_transition(SmList):
    """Perform separation:

          state machine  ---->  first transition  +  appendix state machine

    for each state machine.

    RETURNS: list of (character set, appendix state machine)

    Character sets MAY INTERSECT, and MAY REQUIRE NON-UNIFORM count actions.
    """
    result                         = []
    appendix_sm_to_iid_original_db = {}
    for original_sm in SmList:
        iid_original = original_sm.get_id()
        for first_set, appendix_sm in _cut_first_transition(original_sm,
                                                            CloneStateMachineId=False):
            # Every appendix DFA gets its own 'id'.
            # HOWEVER: Multiple appendix DFAs might match the same 'acceptance id',
            #          => such DFAs transit to the same terminal upon acceptance.
            appendix_sm.mark_state_origins()
            result.append((first_set, appendix_sm))
            assert appendix_sm.get_id() not in appendix_sm_to_iid_original_db
            appendix_sm_to_iid_original_db[appendix_sm.get_id()] = iid_original

    for character_set, appendix_sm in result:
        init_state = appendix_sm.get_init_state()
        if init_state.input_position_store_f():
            error.log("skip/skip_range/indentation/counter implementation.\n"
                      "Inadmissible post context after first character.\n"
                      "(This should have been detected during the parsing process)")

    return result, appendix_sm_to_iid_original_db
def parse_standard_members(fh, section_name, descriptor, already_defined_list):
    if not check(fh, "{"):
        error.log("Missing opening '{' at begin of token_type section '%s'." % section_name,
                  fh)

    position = fh.tell()

    while 1 + 1 == 2:
        try:
            result = parse_variable_definition(fh)
        except EndOfStreamException:
            fh.seek(position)
            error.error_eof("standard", fh)

        if result is None:
            return
        type_code_fragment, name = result[0], result[1]

        __validate_definition(type_code_fragment, name, already_defined_list,
                              StandardMembersF=True)

        if   name == "id":            descriptor.token_id_type      = type_code_fragment
        elif name == "column_number": descriptor.column_number_type = type_code_fragment
        elif name == "line_number":   descriptor.line_number_type   = type_code_fragment
        else:
            assert False  # This should have been caught by the variable parser

        already_defined_list.append([name, type_code_fragment])
def run(cl, Argv):
    if Setup.query_version_f:
        print_version()
        return
    elif Setup.query_help_f:
        print_help()
        return

    # Regular expressions extract the BufferLimitCode and the PathTerminatorCode
    # from the sets. So let us define them outside the normal range.
    backup_buffer_limit_code = Setup.buffer_limit_code
    backup_path_limit_code   = Setup.path_limit_code
    Setup.buffer_limit_code  = -1
    Setup.path_limit_code    = -1

    try:
        if   Setup.query_codec:                __handle_codec(cl)
        elif Setup.query_codec_list:           __handle_codec_list(cl)
        elif Setup.query_codec_file:           __handle_codec_file(cl)
        elif Setup.query_codec_language:       __handle_codec_for_language(cl)
        elif Setup.query_property is not None: __handle_property(cl)
        elif Setup.query_set_by_property:      __handle_set_by_property(cl)
        elif Setup.query_set_by_expression:    __handle_set_by_expression(cl)
        elif Setup.query_property_match:       __handle_property_match(cl)
        else:
            assert False  # No query option(s)!
    except RegularExpressionException, x:
        error.log(x.message)
def __parse_event(new_mode, fh, word):
    pos = fh.tell()

    # Allow '<<EOF>>' and '<<FAIL>>' out of respect for classical tools like 'lex'
    if   word == "<<EOF>>":  word = "on_end_of_stream"
    elif word == "<<FAIL>>": word = "on_failure"
    elif word in blackboard.all_section_title_list:
        error.log("Pattern '%s' is a quex section title. Has the closing '}' of mode %s\n" \
                  % (word, new_mode.name) \
                  + "been forgotten? Else use quotes, i.e. \"%s\"." % word, fh)
    elif len(word) < 3 or word[:3] != "on_":
        return False

    comment = "Unknown event handler '%s'.\n" % word \
              + "Note that any pattern starting with 'on_' is considered an event handler.\n" \
              + "Use double quotes to bracket patterns that start with 'on_'."

    __general_validate(fh, new_mode, word, pos)
    error.verify_word_in_list(word, standard_incidence_db.keys() + ["keyword_list"],
                              comment, fh)

    code = code_fragment.parse(fh, "%s::%s event handler" % (new_mode.name, word))

    incidence_id = standard_incidence_db[word][0]
    if Lng.suspicious_RETURN_in_event_handler(incidence_id, code.get_text()):
        error.warning("Suspicious 'RETURN' in event handler '%s'.\n" % incidence_id \
                      + "This statement will trigger the 'on_after_match' handler.\n" \
                      + "Maybe use a plain 'return' instead.", code.sr)

    if word == "on_n_dedent" and not token_db.support_repetition():
        error.warning("Found 'on_n_dedent', but no single token has been specified\n"
                      "in a 'repeated_token' section.", code.sr)

    new_mode.incidence_db[word] = code
    return True
def __general_validate(fh, Mode, Name, pos):
    if Name == "on_indentation":
        fh.seek(pos)
        error.log("Definition of 'on_indentation' is no longer supported since version 0.51.1.\n"
                  "Please, use 'on_indent' for the event of an opening indentation, 'on_dedent'\n"
                  "for closing indentation, and 'on_nodent' for no change in indentation.", fh)

    def error_dedent_and_ndedent(code, A, B):
        error.log("Indentation event handler '%s' cannot be defined, because\n" % A,
                  fh, DontExitF=True)
        error.log("the alternative '%s' has already been defined." % B, code.sr)

    if Name == "on_dedent" and Mode.incidence_db.has_key("on_n_dedent"):
        fh.seek(pos)
        code = Mode.incidence_db["on_n_dedent"]
        if not code.is_whitespace():
            error_dedent_and_ndedent(code, "on_dedent", "on_n_dedent")

    if Name == "on_n_dedent" and Mode.incidence_db.has_key("on_dedent"):
        fh.seek(pos)
        code = Mode.incidence_db["on_dedent"]
        if not code.is_whitespace():
            error_dedent_and_ndedent(code, "on_n_dedent", "on_dedent")
def __prepare_buffer_element_specification(setup):
    global global_character_type_db
    if Setup.buffer_lexatom_size_in_byte == "wchar_t":
        error.log("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")
    if Setup.buffer_lexatom_type == "wchar_t":
        Setup.converter_ucs_coding_name = "WCHAR_T"

    # (*) Determine buffer element type and size (in bytes)
    lexatom_size_in_byte = Setup.buffer_lexatom_size_in_byte
    if lexatom_size_in_byte == -1:
        if global_character_type_db.has_key(Setup.buffer_lexatom_type):
            lexatom_size_in_byte = global_character_type_db[Setup.buffer_lexatom_type][3]
        elif Setup.buffer_lexatom_type == "":
            lexatom_size_in_byte = 1
        else:
            # Buffer element type is not identified in 'global_character_type_db'.
            # => Here Quex cannot know its size on its own.
            lexatom_size_in_byte = -1

    if Setup.buffer_lexatom_type == "":
        if lexatom_size_in_byte in [1, 2, 4]:
            Setup.buffer_lexatom_type = {
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[lexatom_size_in_byte]
        elif lexatom_size_in_byte == -1:
            pass
        else:
            error.log("Buffer element type cannot be determined for size '%i' which\n" \
                      % lexatom_size_in_byte \
                      + "has been specified by '-b' or '--buffer-element-size'.")

    return lexatom_size_in_byte
def __parse_event(new_mode, fh, word):
    pos = fh.tell()

    # Allow '<<EOF>>' and '<<FAIL>>' out of respect for classical tools like 'lex'
    if   word == "<<EOF>>":  word = "on_end_of_stream"
    elif word == "<<FAIL>>": word = "on_failure"
    elif word in blackboard.all_section_title_list:
        error.log("Pattern '%s' is a quex section title. Has the closing '}' of mode %s\n" \
                  % (word, new_mode.name) \
                  + "been forgotten? Else use quotes, i.e. \"%s\"." % word, fh)
    elif len(word) < 3 or word[:3] != "on_":
        return False

    comment = "Unknown event handler '%s'.\n" % word \
              + "Note that any pattern starting with 'on_' is considered an event handler.\n" \
              + "Use double quotes to bracket patterns that start with 'on_'."

    __general_validate(fh, new_mode, word, pos)
    error.verify_word_in_list(word, standard_incidence_db.keys(), comment, fh)
    __validate_required_token_policy_queue(word, fh, pos)

    continue_f = True
    if word == "on_end_of_stream" or word == "on_failure":
        # -- When a termination token is sent, no other token shall follow.
        #    => Enforce a return from the analyzer! Do not allow CONTINUE!
        # -- When an 'on_failure' is received, allow immediate action of the
        #    receiver => Do not allow CONTINUE!
        continue_f = False

    new_mode.incidence_db[word] = \
        code_fragment.parse(fh, "%s::%s event handler" % (new_mode.name, word),
                            ContinueF=continue_f)
    return True
def __parse_base_mode_list(fh, new_mode):
    new_mode.derived_from_list = []
    trailing_comma_f           = False
    while 1 + 1 == 2:
        if   check(fh, "{"):
            fh.seek(-1, 1)
            break
        elif check(fh, "<"):
            fh.seek(-1, 1)
            break

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            break

        new_mode.derived_from_list.append(identifier)

        trailing_comma_f = False
        if not check(fh, ","):
            break
        trailing_comma_f = True

    if trailing_comma_f:
        error.warning("Trailing ',' after base mode '%s'." % new_mode.derived_from_list[-1],
                      fh)
    elif len(new_mode.derived_from_list) != 0:
        # This check is a 'service' -- for those who follow the old convention
        pos = fh.tell()
        skip_whitespace(fh)
        dummy_identifier = read_identifier(fh)
        if dummy_identifier != "":
            error.log("Missing separating ',' between base modes '%s' and '%s'.\n" \
                      % (new_mode.derived_from_list[-1], dummy_identifier) \
                      + "(The comma separator is mandatory since quex 0.53.1)", fh)
        fh.seek(pos)
def parse(fh):
    """This function parses a mode description and enters it into the
    'blackboard.mode_description_db'. Once all modes are parsed they can be
    translated into 'real' modes and are located in 'blackboard.mode_db'.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    skip_whitespace(fh)
    mode_name = read_identifier(fh,
                                OnMissingStr="Missing identifier at beginning of mode definition.")

    # NOTE: constructor does register this mode in the mode_db
    new_mode = ModeDescription(mode_name, SourceRef.from_FileHandle(fh))

    # (*) inherited modes / option_db
    skip_whitespace(fh)
    dummy = fh.read(1)
    if dummy not in [":", "{"]:
        error.log("missing ':' or '{' after mode '%s'" % mode_name, fh)

    if dummy == ":":
        __parse_option_list(new_mode, fh)

    # (*) read in pattern-action pairs and events
    while __parse_element(new_mode, fh):
        pass
def parse(fh):
    """Parses pattern definitions of the form:

          WHITESPACE  [ \t\n]
          IDENTIFIER  [a-zA-Z0-9]+
          OP_PLUS     "+"
          \function   SOMETHING(sm = X, set = Y, number = N):

    That means: 'name' whitespace 'regular expression' whitespace newline.
    Comments can only be '//', nothing else, and they have to appear at the
    beginning of the line. One regular expression can have more than one
    name, but one name can only have one regular expression.
    """
    skip_whitespace(fh)
    if not check(fh, "{"):
        error.log("define region must start with opening '{'.", fh)

    while 1 + 1 == 2:
        skip_whitespace(fh)

        if check(fh, "}"):
            return

        # Get the name of the pattern
        skip_whitespace(fh)
        if check(fh, "\\function"):
            name, value = _parse_function(fh)
        else:
            name, value = _parse_pattern(fh)

        blackboard.shorthand_db[name] = value
def do(BufferCodecName, BufferCodecFileName=""):
    from quex.engine.state_machine.transformation.base              import EncodingTrafoUnicode
    from quex.engine.state_machine.transformation.table             import EncodingTrafoByTable
    from quex.engine.state_machine.transformation.utf8_state_split  import EncodingTrafoUTF8
    from quex.engine.state_machine.transformation.utf16_state_split import EncodingTrafoUTF16

    if   BufferCodecName == "utf8":
        return EncodingTrafoUTF8()
    elif BufferCodecName == "utf16":
        return EncodingTrafoUTF16()
    elif BufferCodecFileName:
        os.path.splitext(os.path.basename(BufferCodecFileName))
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except:
            error.log("cannot interpret string following '--codec-file'")
        return EncodingTrafoByTable(FileName=BufferCodecFileName)
    elif BufferCodecName == "unicode":
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        return EncodingTrafoUnicode(NumberSet(Interval(0, 0x110000)),
                                    NumberSet(Interval(0, 0x110000)))
    elif BufferCodecName == "unit-test":
        return EncodingTrafoUnicode(NumberSet_All(), NumberSet_All())
    else:
        return EncodingTrafoByTable(BufferCodecName)
def test(Text):
    global count_n
    count_n += 1
    if Text.find("\n") == -1:
        print "(%i) |%s|\n" % (count_n, Text)
    else:
        print "(%i)\n::\n%s\n::\n" % (count_n, Text)

    sh      = StringIO(Text)
    sh.name = "test_string"

    descr = None
    # descr = counter.parse_line_column_counter(sh)
    try:
        descr = counter.parse_line_column_counter(sh)
        pass
    except EndOfStreamException:
        error.log("End of file reached while parsing 'counter' section.", sh,
                  DontExitF=True)
    except:
        print "Exception!"

    if descr is not None:
        print descr.count_command_map
    print
def get_character_value_limit(self):
    """A buffer element is a chunk of memory of the size of the granularity
    by which the input pointer increases. For fixed size codecs, such as
    ASCII or UCS32, the BUFFER ELEMENT VALUE LIMIT is exactly the same as
    the CHARACTER VALUE LIMIT.

    However, for dynamically sized codecs, such as UTF8 or UTF16, they are
    different. In UTF8, the input pointer increments by one byte on each
    state transition. However, a character may consist of multiple bytes.
    The buffer element value limit is 256, but the character value limit
    is the whole range.

    RETURNS: Integer = supremum of possible character range, i.e.
                       one character behind the last possible.

             sys.maxint, if no such limit exists.
    """
    buffer_element_size = self.buffer_element_size

    if buffer_element_size == -1:
        return sys.maxint

    try:
        result = 256 ** buffer_element_size
    except:
        error.log("Error while trying to compute 256 to the power of the 'buffer-element-size' (%i bytes)\n" \
                  % buffer_element_size \
                  + "Adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n" \
                  + "or specify '--buffer-element-size-irrelevant' to ignore the issue.")

    if result > sys.maxint:
        return sys.maxint
    else:
        return result
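# A minimal check (assumption: illustration only, not part of the quex sources) of the
# arithmetic used above: with a buffer element of N bytes the supremum of representable
# values is 256**N, i.e. one past the largest value that fits into N bytes.
assert 256 ** 1 == 0x100         # uint8_t  lexatoms: values 0..0xFF
assert 256 ** 2 == 0x10000       # uint16_t lexatoms: values 0..0xFFFF
assert 256 ** 4 == 0x100000000   # uint32_t lexatoms: values 0..0xFFFFFFFF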
def utf8_to_unicode(ByteSequence):
    """Unfortunately, there is no elegant way to do the utf8-decoding safely
    in libPython, since due to strange behavior of a python narrow build a
    character >= 0x10000 may appear as a 2 byte string and cannot be handled
    by 'ord' in python 2.x. Thus:

          utf8d = codecs.getdecoder("utf-8")
          return ord(utf8d("".join(map(chr, ByteSequence)))[0])

    would be unsafe. That's why we do it manually.
    """
    # Assume that the byte sequence is valid. Thus, a byte sequence of length 'N'
    # has N - 1 leading ones in the header plus a zero. Remaining bits in the
    # header are therefore 8 - N. All other bytes in the sequence start with bits '10'
    # and contain 6 bits of useful payload.
    header_bit_n = 8 - len(ByteSequence)
    mask         = (1 << header_bit_n) - 1
    value        = ByteSequence[0] & mask
    for byte in ByteSequence[1:]:
        value <<= 6
        value |=  (byte & 0x3F)  # keep only the six payload bits
        # The highest two bits in a follow byte in utf8 MUST be '10'. Thus:
        if (byte & 0xC0) != 0x80:
            error.log("Error in UTF8 encoded file. Inadmissible byte sequence detected. Found byte '%02X'" % byte)

    return value
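# A quick, self-contained sanity check (assumption: illustration only, not part of the
# quex sources) of the manual decoding scheme above. The Euro sign U+20AC is encoded in
# UTF-8 as E2 82 AC: the header byte 0xE2 = 1110 0010 announces a three-byte sequence and
# carries 4 payload bits; each follow byte contributes its lower 6 bits.
seq   = [0xE2, 0x82, 0xAC]
value = seq[0] & ((1 << (8 - len(seq))) - 1)
for b in seq[1:]:
    value = (value << 6) | (b & 0x3F)
assert value == 0x20AC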
def buffer_element_specification_prepare(self):
    global global_character_type_db
    if self.buffer_element_size == "wchar_t":
        error.log("Since Quex version 0.53.5, 'wchar_t' can no longer be specified\n"
                  "with option '--buffer-element-size' or '-bes'. Please, specify\n"
                  "'--buffer-element-type wchar_t' or '--bet'.")
    if self.buffer_element_type == "wchar_t":
        self.converter_ucs_coding_name = "WCHAR_T"

    # (*) Determine buffer element type and size (in bytes)
    if self.buffer_element_size == -1:
        if global_character_type_db.has_key(self.buffer_element_type):
            self.buffer_element_size = global_character_type_db[self.buffer_element_type][3]
        elif self.buffer_element_type == "":
            self.buffer_element_size = 1
        else:
            # Buffer element type is not identified in 'global_character_type_db'.
            # => Here Quex cannot know its size on its own.
            self.buffer_element_size = -1

    if self.buffer_element_type == "":
        if self.buffer_element_size in [1, 2, 4]:
            self.buffer_element_type = {
                1: "uint8_t", 2: "uint16_t", 4: "uint32_t",
            }[self.buffer_element_size]
        elif self.buffer_element_size == -1:
            pass
        else:
            error.log("Buffer element type cannot be determined for size '%i' which\n" \
                      % self.buffer_element_size \
                      + "has been specified by '-b' or '--buffer-element-size'.")

    self.__buffer_element_specification_done_f = True
def _error_if_defined_before(Before, sr):
    if not Before.set_f():
        return
    error.log("'%s' has been defined before;" % Before.name, sr, DontExitF=True)
    error.log("at this place.", Before.sr)
def buffer_codec_prepare(self, BufferCodecName, BufferCodecFileName=None, Module=None):
    """Determines:  Setup.buffer_codec_name
                    Setup.buffer_codec
    """
    assert    BufferCodecName == "unit-test" \
           or self.__buffer_element_specification_done_f == True

    if BufferCodecName in ("utf8", "utf16"):
        assert Module is not None
        result = codec_db.CodecDynamicInfo(BufferCodecName, Module)
    elif BufferCodecFileName:
        os.path.splitext(os.path.basename(BufferCodecFileName))
        try:
            os.path.splitext(os.path.basename(BufferCodecFileName))
        except:
            error.log("cannot interpret string following '--codec-file'")
        result = codec_db.CodecTransformationInfo(FileName=BufferCodecFileName)
    elif BufferCodecName == "unicode":
        # (Still, 'icu' or 'iconv' may provide converted content, but ...)
        # If the internal buffer is 'unicode', then the pattern's state
        # machines are not converted. The requirement for the pattern's
        # range is the same as for the 'buffer element chunks'.
        result = codec_db.CodecInfo("unicode",
                                    NumberSet.from_range(0, self.get_character_value_limit()),
                                    NumberSet.from_range(0, self.get_character_value_limit()))
    elif BufferCodecName == "unit-test":
        result = codec_db.CodecInfo("unicode",
                                    NumberSet.from_range(-sys.maxint, sys.maxint),
                                    NumberSet.from_range(-sys.maxint, sys.maxint))
    else:
        result = codec_db.CodecTransformationInfo(BufferCodecName)

    self.buffer_codec = result
def parse(fh, CodeFragmentName,
          ErrorOnFailureF=True, AllowBriefTokenSenderF=True, ContinueF=True):
    """RETURNS: An object of class CodeUser containing
                line number, filename, and the code fragment.

                None in case of failure.
    """
    assert type(ErrorOnFailureF)        == bool
    assert type(AllowBriefTokenSenderF) == bool

    skip_whitespace(fh)
    word = fh.read(2)
    if len(word) >= 1 and word[0] == "{":
        if len(word) > 1:
            fh.seek(-1, 1)  # unput the second character
        return __parse_normal(fh, CodeFragmentName)
    elif AllowBriefTokenSenderF and word == "=>":
        return __parse_brief_token_sender(fh, ContinueF)
    elif not ErrorOnFailureF:
        fh.seek(-2, 1)
        return None
    else:
        error.log("Missing code fragment after %s definition." % CodeFragmentName, fh)
def __check_file_name(setup, Candidate, Name, Index=None, CommandLineOption=None):
    value = setup.__dict__[Candidate]
    if len(value) == 0:
        return

    if CommandLineOption is None:
        CommandLineOption = command_line_args(Candidate)

    if Index is not None:
        if type(value) != list or len(value) <= Index:
            value = ""
        else:
            value = value[Index]

    if type(value) == list:
        for name in value:
            if name != "" and name[0] == "-":
                error.log("Quex refuses to work with file names that start with '-' (minus).\n" \
                          + "Received '%s' for %s (%s)" % (value, name, repr(CommandLineOption)[1:-1]))
            if os.access(name, os.F_OK) == False:
                # error.log("File %s (%s)\ncannot be found." % (name, Name))
                error.log_file_not_found(name, Name)
    else:
        if value == "" or value[0] == "-":
            return
        if os.access(value, os.F_OK):
            return
        if os.access(QUEX_PATH + "/" + value, os.F_OK):
            return
        if     os.access(os.path.dirname(value), os.F_OK) == False \
           and os.access(QUEX_PATH + "/" + os.path.dirname(value), os.F_OK) == False:
            error.log("File '%s' is supposed to be located in directory '%s' or\n" \
                      % (os.path.basename(value), os.path.dirname(value)) \
                      + "'%s'. No such directories exist." \
                      % (QUEX_PATH + "/" + os.path.dirname(value)))
        error.log_file_not_found(value, Name)
def error_dedent_and_ndedent(code, A, B):
    error.log("Indentation event handler '%s' cannot be defined, because\n" % A,
              fh, DontExitF=True)
    error.log("the alternative '%s' has already been defined." % B, code.sr)
def __parse(fh, result, IndentationSetupF=False):
    """Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    In other words, the right hand side *must* be a character set.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    while 1 + 1 == 2:
        skip_whitespace(fh)

        if check(fh, ">"):
            break

        # A regular expression state machine
        pattern, identifier, sr = __parse_definition_head(fh, result)
        if pattern is None and IndentationSetupF:
            error.log("Keyword '\\else' cannot be used in indentation setup.", fh)

        # '__parse_definition_head()' ensures that only identifiers mentioned in
        # 'result' are accepted.
        if not IndentationSetupF:
            value = read_value_specifier(fh, identifier, 1)
            result.specify(identifier, pattern, value, sr)
        else:
            result.specify(identifier, pattern, sr)

        if not check(fh, ";"):
            error.log("Missing ';' after '%s' specification." % identifier, fh)

    return result
def unicode_to_utf8(UnicodeValue):
    if UnicodeValue < 0x80:
        return [ UnicodeValue ]
    elif UnicodeValue < 0x800:
        # Bits: 5 + 6
        return [ 0xC0 | ((UnicodeValue >> 6)  & 0x1F),
                 0x80 | ( UnicodeValue        & 0x3F) ]
    elif UnicodeValue < 0x10000:
        # Bits: 4 + 6 + 6
        return [ 0xE0 | ((UnicodeValue >> 12) & 0x0F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ( UnicodeValue        & 0x3F) ]
    elif UnicodeValue < 0x00200000:
        # Bits: 3 + 6 + 6 + 6
        return [ 0xF0 | ((UnicodeValue >> 18) & 0x07),
                 0x80 | ((UnicodeValue >> 12) & 0x3F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ( UnicodeValue        & 0x3F) ]
    elif UnicodeValue < 0x04000000L:
        # Bits: 2 + 6 + 6 + 6 + 6
        # (leading byte of a five byte sequence carries the '111110' marker)
        return [ 0xF8 | ((UnicodeValue >> 24) & 0x03),
                 0x80 | ((UnicodeValue >> 18) & 0x3F),
                 0x80 | ((UnicodeValue >> 12) & 0x3F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ( UnicodeValue        & 0x3F) ]
    elif UnicodeValue < 0x80000000L:
        # Bits: 1 + 6 + 6 + 6 + 6 + 6
        # (leading byte of a six byte sequence carries the '1111110' marker)
        return [ 0xFC | ((UnicodeValue >> 30) & 0x01),
                 0x80 | ((UnicodeValue >> 24) & 0x3F),
                 0x80 | ((UnicodeValue >> 18) & 0x3F),
                 0x80 | ((UnicodeValue >> 12) & 0x3F),
                 0x80 | ((UnicodeValue >> 6)  & 0x3F),
                 0x80 | ( UnicodeValue        & 0x3F) ]
    else:
        error.log("Unicode character 0x%8X > 0x7FFFFFFF detected. Cannot be handled." % UnicodeValue)
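# A round-trip sanity check (assumption: illustrative test only, not part of the quex
# sources; it presumes unicode_to_utf8() and the utf8_to_unicode() shown earlier live in
# the same scope, as they do in quex's utf8 helpers). For code points below 0x10000 the
# byte sequences are also compared against Python 2's built-in UTF-8 encoder, which is
# safe even on narrow builds.
for code_point in (0x24, 0x7F, 0x80, 0x7FF, 0x800, 0x20AC, 0xFFFF, 0x10000, 0x10FFFF):
    sequence = unicode_to_utf8(code_point)
    assert utf8_to_unicode(sequence) == code_point
    if code_point < 0x10000:
        assert sequence == [ord(b) for b in unichr(code_point).encode("utf-8")]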
def utf8_to_unicode(ByteSequence):
    """Unfortunately, there is no elegant way to do the utf8-decoding safely
    in libPython, since due to strange behavior of a python narrow build a
    character >= 0x10000 may appear as a 2 byte string and cannot be handled
    by 'ord' in python 2.x. Thus:

          utf8d = codecs.getdecoder("utf-8")
          return ord(utf8d("".join(map(chr, ByteSequence)))[0])

    would be unsafe. That's why we do it by hand here.
    """
    # Assume that the byte sequence is valid. Thus, a byte sequence of length 'N'
    # has N - 1 leading ones in the header plus a zero. Remaining bits in the
    # header are therefore 8 - N. All other bytes in the sequence start with bits '10'
    # and contain 6 bits of useful payload.
    header_bit_n = 8 - len(ByteSequence)
    mask         = (1 << header_bit_n) - 1
    value        = ByteSequence[0] & mask
    for byte in ByteSequence[1:]:
        value <<= 6
        value |=  (byte & 0x3F)  # keep only the six payload bits
        # The highest two bits in a follow byte in utf8 MUST be '10'. Thus:
        if (byte & 0xC0) != 0x80:
            error.log("Error in UTF8 encoded file. Inadmissible byte sequence detected. Found byte '%02X'" % byte)

    return value
def do(file_list):
    if not file_list and not (Setup.token_class_only_f or Setup.converter_only_f):
        error.log("No input files.")

    mode_prep_prep_db = {}  # mode name --> Mode_PrepPrep object
    #                       # later: Mode_PrepPrep is transformed into Mode objects.

    # If a foreign token-id file was presented even the standard token ids
    # must be defined there.
    if not Setup.extern_token_id_file:
        prepare_default_standard_token_ids()

    for file_name in file_list:
        error.insight("File '%s'" % file_name)
        fh = open_file_or_die(file_name, CodecCheckF=True)

        # read all modes until end of file
        try:
            while 1 + 1 == 2:
                parse_section(fh, mode_prep_prep_db)
        except EndOfStreamException:
            pass
        except RegularExpressionException, x:
            error.log(x.message, fh)
def detect_path_of_nothing_is_necessary(sm, Name, PostContextPresentF, fh):
    assert Name in ["", "pre-context", "post-context"]
    if sm is None:
        return
    elif not sm.get_init_state().is_acceptance():
        return

    if len(Name) == 0:
        name_str = "core pattern"
    else:
        name_str = Name

    msg = "The %s contains a 'nothing is necessary' path in the state machine.\n" \
          % name_str \
          + "This means that, without reading a character, the analyzer drops into\n" \
          + "an acceptance state. "

    msg += {
        "": "The analyzer would then stall.",

        "pre-context":  "E.g., pattern 'x*/y/' means that zero or more 'x' are a pre-\n" \
                        + "condition for 'y'. If zero appearances of 'x' are enough, then obviously\n" \
                        + "there is no pre-context for 'y'! Most likely the author intended 'x+/y/'.",

        "post-context": "A post context where nothing is necessary is superfluous.",
    }[Name]

    if Name != "post-context" and PostContextPresentF:
        msg += "\n" \
               "Note: A post context does not change anything about that fact."

    error.log(msg, fh)
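# A tiny, self-contained illustration (assumption: not part of the quex sources) of the
# 'nothing is necessary' situation described above: a sub-pattern such as 'x*' accepts
# without consuming any input, because zero repetitions already suffice.
import re
assert re.match("x*", "yyy").group(0) == ""   # accepts the empty string immediately
assert re.match("x+", "yyy") is None          # 'x+' really requires at least one 'x'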
def parse(fh):
    """This function parses a mode description and enters it into the
    'blackboard.mode_description_db'. Once all modes are parsed they can be
    translated into 'real' modes and are located in 'blackboard.mode_db'.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    skip_whitespace(fh)
    mode_name = read_identifier(
        fh, OnMissingStr="Missing identifier at beginning of mode definition.")

    # NOTE: constructor does register this mode in the mode_db
    new_mode = ModeDescription(mode_name, SourceRef.from_FileHandle(fh))

    # (*) inherited modes / option_db
    skip_whitespace(fh)
    dummy = fh.read(1)
    if dummy not in [":", "{"]:
        error.log("missing ':' or '{' after mode '%s'" % mode_name, fh)

    if dummy == ":":
        __parse_option_list(new_mode, fh)

    # (*) read in pattern-action pairs and events
    while __parse_element(new_mode, fh):
        pass
def __general_validate(fh, Mode, Name, pos):
    if Name == "on_indentation":
        fh.seek(pos)
        error.log(
            "Definition of 'on_indentation' is no longer supported since version 0.51.1.\n"
            "Please, use 'on_indent' for the event of an opening indentation, 'on_dedent'\n"
            "for closing indentation, and 'on_nodent' for no change in indentation.", fh)

    def error_dedent_and_ndedent(code, A, B):
        error.log("Indentation event handler '%s' cannot be defined, because\n" % A,
                  fh, DontExitF=True)
        error.log("the alternative '%s' has already been defined." % B, code.sr)

    if Name == "on_dedent" and Mode.incidence_db.has_key("on_n_dedent"):
        fh.seek(pos)
        code = Mode.incidence_db["on_n_dedent"]
        if not code.is_whitespace():
            error_dedent_and_ndedent(code, "on_dedent", "on_n_dedent")

    if Name == "on_n_dedent" and Mode.incidence_db.has_key("on_dedent"):
        fh.seek(pos)
        code = Mode.incidence_db["on_dedent"]
        if not code.is_whitespace():
            error_dedent_and_ndedent(code, "on_n_dedent", "on_dedent")
def __error_detection(not_found_list, recursive_list):
    ErrorN = NotificationDB.token_id_ignored_files_report
    if ErrorN not in Setup.suppressed_notification_list:
        if not_found_list:
            not_found_list.sort()
            sr = SourceRef(not_found_list[0][0], LineN=not_found_list[0][1])
            error.warning("Files not found:", sr)
            for file_name, line_n, included_file in not_found_list:
                error.warning("%s" % included_file, SourceRef(file_name, line_n))

        if recursive_list:
            recursive_list.sort()
            sr = SourceRef(recursive_list[0][0], LineN=recursive_list[0][1])
            error.warning("Files recursively included (ignored second inclusion):", sr)
            for file_name, line_n, included_file in recursive_list:
                error.warning("%s" % included_file, SourceRef(file_name, line_n))

        if not_found_list or recursive_list:
            # source reference is taken from last setting
            error.log("\nQuex does not handle C-Preprocessor instructions.",
                      sr, NoteF=True, DontExitF=True, SuppressCode=ErrorN)
def __roman_number(Text, fh):
    """Source: http://code.activestate.com -- Code Recipes
       Recipe 81611 by Paul Winkler.
    """
    input = Text.upper()
    # map of (numeral, value, maxcount) tuples
    roman_numeral_map = (('M',  1000, None),
                         ('CM', 900,  1),
                         ('D',  500,  1),
                         ('CD', 400,  1),
                         ('C',  100,  3),
                         ('XC', 90,   1),
                         ('L',  50,   1),
                         ('XL', 40,   1),
                         ('X',  10,   3),
                         ('IX', 9,    1),
                         ('V',  5,    1),
                         ('IV', 4,    1),
                         ('I',  1,    3))

    result, index = 0, 0
    for numeral, value, maxcount in roman_numeral_map:
        count = 0
        while input[index:index + len(numeral)] == numeral:
            count += 1  # how many of this numeral we have
            if maxcount is not None and count > maxcount:
                error.log("input 0r%s is not a valid roman numeral." % Text, fh)
            result += value
            index  += len(numeral)

    if index < len(input):  # There are characters unaccounted for.
        error.log("input 0r%s is not a valid roman numeral." % Text, fh)

    return result
def __handle_property_match(cl):
    property_follower = Setup.query_property_match
    if not property_follower:
        return

    sys.stderr.write("(please, wait for database parsing to complete)\n")

    fields = map(lambda x: x.strip(), property_follower.split("="))
    if len(fields) != 2:
        error.log("Wrong property setting '%s'." % property_follower)

    # -- determine name and value
    name                 = fields[0]
    wild_card_expression = fields[1]

    # -- get the property from the database
    property = __get_property(name)
    if property is None:
        return True

    # -- find the character set for the given expression
    if property.type == "Binary":
        error.log("Binary property '%s' is not subject to value wild card matching.\n" % property.name)

    for value in property.get_wildcard_value_matches(wild_card_expression):
        print value
def read_identifier(fh, TolerantF=False, OnMissingStr=None):
    def __read(fh, TolerantF):
        txt = fh.read(1)
        if len(txt) == 0:
            return ""

        if TolerantF:
            if is_identifier_continue(txt) == False:
                fh.seek(-1, 1)
                return ""
        else:
            if is_identifier_start(txt) == False:
                fh.seek(-1, 1)
                return ""

        while 1 + 1 == 2:
            tmp = fh.read(1)
            if len(tmp) == 0:
                return txt
            if is_identifier_continue(tmp):
                txt += tmp
            else:
                fh.seek(-1, 1)
                return txt

    result = __read(fh, TolerantF)
    if len(result) == 0 and OnMissingStr is not None:
        error.log(OnMissingStr, fh)
    return result
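# A minimal, self-contained sketch (assumption: illustration only, not part of the quex
# sources; uses Python 2's StringIO module, as the test snippets above do) of the
# lookahead idiom used throughout these parsing helpers: read one character, and if it
# does not belong to the token, push it back with a relative seek of -1 so the next
# reader sees it again.
from StringIO import StringIO

def read_digits_sketch(fh):
    txt = ""
    while True:
        tmp = fh.read(1)
        if tmp == "":
            break              # end of stream
        if not tmp.isdigit():
            fh.seek(-1, 1)     # unput the character for the next reader
            break
        txt += tmp
    return txt

stream = StringIO("123abc")
assert read_digits_sketch(stream) == "123"
assert stream.read(3) == "abc"     # the 'a' was pushed back, not consumed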
def read_integer(fh):
    pos = fh.tell()
    base, digit_list = get_number_base(fh)
    if base is None:
        return None

    txt = ""
    while 1 + 1 == 2:
        tmp = fh.read(1)
        if tmp == "":
            break
        elif tmp not in digit_list:
            fh.seek(-1, 1)
            break
        txt += tmp

    # If we drop out on a digit, then let us assume that the user just missed a point
    if tmp.isdigit() or (tmp in list("ABCDEFabcdef")):
        error.log("Digit '%s' cannot be part of an expression of base %s." % (tmp, base),
                  fh)

    txt = txt.replace(".", "")
    if len(txt) == 0:
        if base in [2, 8, 16, "roman", "Napier"]:
            error.log("Missing digits for number of base %s, found '%s'." % (str(base), tmp),
                      fh)
        fh.seek(pos)
        return None

    # Binary, octal, decimal, and hexadecimal numbers
    if base in [2, 8, 10, 16]:
        return int(txt, base)
    elif base == "roman":
        return __roman_number(txt, fh)
    elif base == "Napier":
        return __napier_number(txt, fh)
    else:
        return __binary_number(txt, fh)
def optional_flags(fh, SectionName, Default, AdmissibleDb, BadCombinationList):
    pos = fh.tell()
    if not check(fh, "("):
        return Default

    flag_txt = read_until_character(fh, ")")
    flag_txt = flag_txt.strip().replace(" ", "").replace("\t", "")\
                               .replace("\n", "").replace("\r", "")

    for letter in flag_txt:
        if letter in AdmissibleDb:
            continue
        fh.seek(pos)
        explanation_txt = ["'%s' for %s." % (flag, explanation)
                           for flag, explanation in AdmissibleDb.iteritems()]
        explanation_str = "Options are: " + "\n ".join(explanation_txt)
        error.log("Flag '%s' not permitted for %s.\n" % (letter, SectionName) \
                  + explanation_str, fh)

    # Bad combination check:
    for bad_combination in BadCombinationList:
        suspect_list = [flag for flag in bad_combination if flag in flag_txt]
        if len(suspect_list) > 1:
            suspect_list.sort()
            error.log("Flag '%s' and '%s' cannot be used\n"
                      "at the same time in %s."
                      % (suspect_list[0], suspect_list[1], SectionName), fh)

    return flag_txt
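# A small, self-contained sketch (assumption: illustration only, not part of the quex
# sources) of the bad-combination test above: a combination is rejected as soon as more
# than one of its flags occurs in the flag string read from the section header.
flag_txt        = "so"
bad_combination = "so"
suspect_list    = [flag for flag in bad_combination if flag in flag_txt]
assert len(suspect_list) > 1   # both 's' and 'o' are present, which is not allowed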
def test(Text):
    global count_n
    count_n += 1
    if Text.find("\n") == -1:
        print "(%i) |%s|\n" % (count_n, Text)
    else:
        print "(%i)\n::\n%s\n::\n" % (count_n, Text)

    sh      = StringIO(Text)
    sh.name = "test_string"

    descr = None
    try:
        descr = IndentationCount.from_FileHandle(sh)
    except EndOfStreamException:
        error.log("End of file reached while parsing 'indentation' section.", sh,
                  DontExitF=True)
    except:
        print "Exception!"

    if descr is not None:
        print descr
    print
def __check_on_orphan_states(Place, sm):
    orphan_state_list = sm.get_orphaned_state_index_list()
    if len(orphan_state_list) == 0:
        return
    error.log("After '%s'" % Place + "\n" \
              + "Orphaned state(s) detected in regular expression (optimization lack).\n" \
              + "Please, log a defect at the project's website quex.sourceforge.net.\n" \
              + "Orphan state(s) = " + repr(orphan_state_list))
def __parse_base_mode_list(fh, new_mode):
    new_mode.derived_from_list = []
    trailing_comma_f           = False
    while 1 + 1 == 2:
        if   check(fh, "{"):
            fh.seek(-1, 1)
            break
        elif check(fh, "<"):
            fh.seek(-1, 1)
            break

        skip_whitespace(fh)
        identifier = read_identifier(fh)
        if identifier == "":
            break

        new_mode.derived_from_list.append(identifier)

        trailing_comma_f = False
        if not check(fh, ","):
            break
        trailing_comma_f = True

    if trailing_comma_f:
        error.warning("Trailing ',' after base mode '%s'." % new_mode.derived_from_list[-1],
                      fh)
    elif len(new_mode.derived_from_list) != 0:
        # This check is a 'service' -- for those who follow the old convention
        pos = fh.tell()
        skip_whitespace(fh)
        dummy_identifier = read_identifier(fh)
        if dummy_identifier != "":
            error.log("Missing separating ',' between base modes '%s' and '%s'.\n" \
                      % (new_mode.derived_from_list[-1], dummy_identifier) \
                      + "(The comma separator is mandatory since quex 0.53.1)", fh)
        fh.seek(pos)
def check_grid_specification(self, Value, sr):
    if Value == 0:
        error.log("A grid count of 0 is nonsense. Maybe define a space count of 0.", sr)
    elif Value == 1:
        error.warning("An indentation grid count of '1' is equivalent to a space\n" \
                      + "count of '1'. The latter is faster to compute.", sr)
def _base_parse(self, fh, IndentationSetupF=False):
    """Parses pattern definitions of the form:

          [ \t]                                       => grid 4;
          [:intersection([:alpha:], [\X064-\X066]):]  => space 1;

    In other words, the right hand side *must* be a character set.

    ADAPTS: result to contain parsing information.
    """
    # NOTE: Catching of EOF happens in caller: parse_section(...)
    #
    while 1 + 1 == 2:
        skip_whitespace(fh)

        if check(fh, ">"):
            break

        # A regular expression state machine
        pattern, identifier, sr = _parse_definition_head(fh, self.identifier_list)
        if pattern is None and IndentationSetupF:
            error.log("Keyword '\\else' cannot be used in indentation setup.", fh)

        # '_parse_definition_head()' ensures that only identifiers mentioned in
        # 'result' are accepted.
        if self.requires_count():
            count = _read_value_specifier(fh, identifier, 1)
            self.specify(identifier, pattern, count, sr)
        else:
            self.specify(identifier, pattern, sr)

        if not check(fh, ";"):
            error.log("Missing ';' after '%s' specification." % identifier, fh)

    return self.finalize()
def adapt_source_and_drain_range(self, LexatomByteN):
    """The drain range may be restricted due to the number of bytes given
    per lexatom. If the 'LexatomByteN' is '-1', it is unrestricted, which
    may be useful for unit tests and theoretical investigations.

    DERIVED CLASS MAY HAVE TO WRITE A DEDICATED VERSION OF THIS FUNCTION
    TO MODIFY THE SOURCE RANGE '.source_set'.
    """
    if LexatomByteN == -1:
        self.lexatom_range = Interval_All()
        return

    assert LexatomByteN >= 1
    lexatom_min_value = self.drain_set.minimum()
    lexatom_max_value = self.drain_set.supremum() - 1
    if LexatomByteN != -1:
        try:
            value_n = 256 ** LexatomByteN
        except:
            error.log("Error while trying to compute 256 to the power of the 'lexatom-size' (%i bytes)\n" \
                      % LexatomByteN \
                      + "Adapt \"--buffer-element-size\" or \"--buffer-element-type\",\n" \
                      + "or specify '--buffer-element-size-irrelevant' to ignore the issue.")
        lexatom_min_value = 0
        lexatom_max_value = min(lexatom_max_value, value_n - 1)

    lexatom_max_value = min(lexatom_max_value, sys.maxint)

    assert lexatom_max_value > lexatom_min_value

    self.lexatom_range = Interval(lexatom_min_value, lexatom_max_value + 1)
    self.drain_set.mask_interval(self.lexatom_range)
def parse(fh, CodeFragmentName,
          ErrorOnFailureF=True, AllowBriefTokenSenderF=True, ContinueF=True):
    """RETURNS: An object of class CodeUser containing
                line number, filename, and the code fragment.

                None in case of failure.
    """
    assert type(ErrorOnFailureF)        == bool
    assert type(AllowBriefTokenSenderF) == bool

    skip_whitespace(fh)
    word = fh.read(2)
    if len(word) >= 1 and word[0] == "{":
        if len(word) > 1:
            fh.seek(-1, 1)  # unput the second character
        return __parse_normal(fh, CodeFragmentName)
    elif AllowBriefTokenSenderF and word == "=>":
        return __parse_brief_token_sender(fh, ContinueF)
    elif not ErrorOnFailureF:
        fh.seek(-2, 1)
        return None
    else:
        error.log("Missing code fragment after %s definition." % CodeFragmentName, fh)
def consistency_check(self):
    self.count_command_map.check_defined(self.sr, E_CharacterCountType.WHITESPACE)
    self.count_command_map.check_defined(self.sr, E_CharacterCountType.BEGIN_NEWLINE)

    if self.sm_newline_suppressor.get() is not None:
        if self.sm_newline.get() is None:
            error.log("A newline 'suppressor' has been defined.\n"
                      "But there is no 'newline' in the indentation definition.",
                      self.sm_newline_suppressor.sr)
def open_data_base_file(Filename):
    try:
        fh = open(unicode_db_directory + "/" + Filename, "rb")
    except:
        error.log("Fatal---Unicode Database File '%s' not found!\n" % Filename \
                  + "QUEX_PATH='%s'\n" % QUEX_PATH \
                  + "Unicode Database Directory: '%s'" % unicode_db_directory)
    return fh
def argv_catch_string(Cl, Option, Type):
    Cl.reset_cursor()
    value = Cl.follow("##EMPTY##", Option)
    if value == "##EMPTY##":
        if Type == SetupParTypes.OPTIONAL_STRING:
            value = ""
        else:
            error.log("Option %s\nnot followed by anything." % str(Option)[1:-1])
    return value
def __compile_regular_expression(Str, Name):
    tmp = Str.replace("*", "\\*")
    tmp = tmp.replace("?", "\\?")
    tmp = tmp.replace("{", "\\{")
    tmp = tmp.replace("}", "\\}")
    try:
        return re.compile(tmp)
    except:
        error.log("Invalid %s: %s" % (Name, Str))
def search_and_validate(CL, Option):
    if CL.search(Option) == False:
        return False

    # Validate command line
    ufos = CL.unidentified_options(OPTION_DB.keys())
    if len(ufos) != 0:
        error.log("Unidentified option(s) = " + repr(ufos) + "\n" \
                  + get_supported_command_line_option_description())
    return True
def add(self, CharSet, Identifier, Value, sr):
    global cc_type_db
    if CharSet.is_empty():
        error.log("Empty character set found for '%s'." % Identifier, sr)
    elif Identifier == "grid":
        self.check_grid_specification(Value, sr)

    cc_type = cc_type_db[Identifier]
    self.check_intersection(cc_type, CharSet, sr)
    self.__map.append((CharSet, CountAction(cc_type, Value, sr)))