def _error_token(self, fsm_str):
    """Form an error token from the unrecognized input at the head of *fsm_str*.

    Exactly one error definition from ``match_error_types()`` is expected to
    match the input. Its matched prefix is shifted off the buffer and wrapped
    in an instance of the corresponding error-token class.

    :Parameters:
        fsm_str : str
            FSM string representing the current buffer contents.
    :Return: the error token instance.
    :Raises RuntimeError: when zero, or more than one, error definition
        matches the input (the tokenizer's error definitions need revising).
    """
    match_list = self.match_error_types(fsm_str)
    if len(match_list) > 1:
        slice_end = min(self._max_lookahead, len(fsm_str))
        # BUG FIX: the original joined the literals with '+', so '%' bound
        # only to the last literal (which has no conversion specifiers) and
        # raised TypeError instead of this message. Adjacent string literals
        # concatenate at parse time, so '%' now formats the whole message.
        raise RuntimeError(
            'internal error: can\'t determine error '
            'type for FSM string starting with %r.\n'
            'The matching errors are: %r.\n'
            'Revise error definitions for the tokenizer.'
            % (fsm_str[:slice_end], match_list))
    elif len(match_list) < 1:
        slice_end = min(self._max_lookahead, len(fsm_str))
        # Same precedence fix as above.
        raise RuntimeError(
            'internal error: can\'t determine error '
            'type for FSM string starting with %r.\n'
            'No matching errors found.\n'
            'Revise error definitions for the tokenizer.'
            % (fsm_str[:slice_end],))
    else:
        _id, Error, match = match_list[0]
        # Shift the offending text off the buffer so scanning can resume
        # right after it; the shifted slice carries its source position.
        s = self._get_buffer().shift(len(match.group(0)))
        token = Error(s.line_no(), s.col_no(), s.char_no())
        token.set_value(str(s))
        return token
def token(self):
    """Try to form the next token from input.

    Reads text from the input most recently bound with `bind()` and tries
    to recognize the next token.

    :Return: the recognized token, ``None`` when there is no more input to
        scan, or one of the predefined error tokens (see `_error_token()`).
    :Raises RuntimeError: when the scanning loop fails to converge within
        ``self._max_iterations`` iterations.
    """
    eoi = False
    buff = self._get_buffer()
    iter_counter = 0
    while iter_counter < self._max_iterations:
        iter_counter += 1
        # Ensure at least _max_lookahead characters are buffered (unless
        # we are approaching end of input). extend() returning less than a
        # full default chunk signals that the input is exhausted.
        while (not eoi) and (len(buff) < self._max_lookahead):
            eoi = (buff.extend() < buff._default_chunk)
        # No content left to scan.
        if len(buff) == 0:
            return None
        fsm_str = buff.fsm_str()
        fsm_str_len = len(fsm_str)
        assert fsm_str_len == len(buff)
        assert fsm_str_len > 0
        # match_list is [(id0, Token0, match0), (id1, Token1, match1), ...]
        # where idN identifies the token, TokenN is its type (instantiable
        # as TokenN(...)), and matchN is the re match object anchored at
        # the beginning of fsm_str.
        match_list = self.match_token_types(fsm_str)
        if not match_list:
            # No token matches the current input: emit an error token.
            # _error_token() also shifts the consumed part of the buffer.
            return self._error_token(fsm_str)
        # BUG FIX: the original used map(), which in Python 3 yields a
        # one-shot iterator without count()/index(); materialize a list so
        # it can be counted, maxed and indexed repeatedly.
        match_len_list = [len(m[2].group(0)) for m in match_list]
        # How many tokens match the whole FSM string?
        whole_match_count = match_len_list.count(fsm_str_len)
        if (whole_match_count == 0) or eoi:
            # Only tokens matching proper substrings remain (or the input
            # is exhausted), so every candidate is complete. Choose the
            # longest one and return it.
            match_len_max = max(match_len_list)
            match_len_max_count = match_len_list.count(match_len_max)
            if match_len_max_count > 1:
                # More than one "longest token" is an ambiguity; interpret
                # it as an incomplete token (which a few more characters
                # could disambiguate) and emit an error token.
                return self._error_token(fsm_str)
            # Exactly one longest match (count >= 1 always holds here, so
            # the original's unreachable defensive branch — whose message
            # was also broken by a '+'/'%' precedence bug — is dropped).
            index = match_len_list.index(match_len_max)
            _id, Token, match = match_list[index]
            s = buff.shift(match_len_max)
            token = Token(s.line_no(), s.col_no(), s.char_no())
            token.set_value(str(s))
            return token
        # whole_match_count > 0 and not eoi: some token could still grow;
        # read more characters and let the loop retry with a longer string.
        eoi = (buff.extend() < buff._default_chunk)
    raise RuntimeError('internal error: loop interrupted at iteration %d'
                       % iter_counter)