Example #1
 def _error_token(self, fsm_str):
     # Return an error token matching the input.
     match_list = self.match_error_types(fsm_str)
     if len(match_list) > 1:
         slice_end = min(self._max_lookahead, len(fsm_str))
         raise RuntimeError(
             "internal error: can't determine error type for FSM "
             "string starting with %r.\n"
             "The matching errors are: %r.\n"
             "Revise error definitions for the tokenizer."
             % (fsm_str[:slice_end], match_list))
     elif len(match_list) < 1:
         slice_end = min(self._max_lookahead, len(fsm_str))
         raise RuntimeError(
             "internal error: can't determine error type for FSM "
             "string starting with %r.\n"
             "No matching errors found.\n"
             "Revise error definitions for the tokenizer."
             % fsm_str[:slice_end])
     else:
         _id, Error, match = match_list[0]
         s = self._get_buffer().shift(len(match.group(0)))
         token = Error(s.line_no(), s.col_no(), s.char_no())
         token.set_value(str(s))
         return token
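
For context, `_error_token()` relies on `match_error_types()` returning a list
of `(id, ErrorClass, match)` triples, and on error-token classes taking
position arguments and carrying a value. Below is a minimal sketch of that
contract, using entirely hypothetical class and pattern names:

import re

class UnknownCharError:
    # Hypothetical error token mirroring the (line_no, col_no, char_no)
    # constructor and set_value() used in Example #1.
    def __init__(self, line_no, col_no, char_no):
        self.line_no, self.col_no, self.char_no = line_no, col_no, char_no
        self.value = None

    def set_value(self, value):
        self.value = value

def match_error_types(fsm_str):
    # A single catch-all error pattern; a real tokenizer would register
    # one entry per error definition.
    error_types = [(0, UnknownCharError, re.compile(r'.', re.DOTALL))]
    result = []
    for type_id, cls, regex in error_types:
        match = regex.match(fsm_str)
        if match:
            result.append((type_id, cls, match))
    return result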
Example #2
    def token(self):
        """Try to form next token from input.
        
        This function reads text from input recently bound with `bind()` and
        tries to recognize next token.
        
        :Return:
            recognized token or one of predefined error tokens (see
            `_error_token()`)
        """
        eoi = False
        buff = self._get_buffer()
        iter_counter = 0
        while iter_counter < self._max_iterations:
            iter_counter += 1
            # Ensure that there are at least _max_lookahead characters
            # in the buffer (unless we are approaching the end of input).
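            # (Assumption: extend() returns the number of characters it
            # actually read, so a short read below signals end of input.)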
            while (not eoi) and (len(buff) < self._max_lookahead):
                eoi = (buff.extend() < buff._default_chunk)

            # If there is no content to scan, return None
            if len(buff) == 0:
                return None
                
            fsm_str = buff.fsm_str()
            fsm_str_len = len(fsm_str)

            assert fsm_str_len == len(buff)
            assert fsm_str_len > 0

            # Look for tokens matching the current FSM string, starting
            # at its beginning. The match_list is a list of matching
            # tokens of the form [(id0, Token0, match0), (id1, Token1,
            # match1), ...], where idN is an identifier of the given
            # token, TokenN is the token's type (so an instance may be
            # created with token = TokenN(...)), and matchN is the match
            # object returned by re.match().
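            # (For illustration only: with hypothetical Identifier and
            # Keyword token classes, match_list might look like
            # [(0, Identifier, <re.Match 'foo'>), (1, Keyword, <re.Match 'for'>)].)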
            match_list = self.match_token_types(fsm_str)
            match_list_len = len(match_list)
            if match_list_len == 0:
                # If no token matches the current input, we must emit an
                # error token. _error_token() also shifts the part of the
                # buffer that enters the returned error token.
                return self._error_token(fsm_str)
            elif match_list_len > 0:
                # Extract the match lengths from match_list (a real list,
                # so it can be scanned several times below)
                match_len_list = [len(m[2].group(0)) for m in match_list]
                # How many tokens match the whole buffer?
                whole_match_count = match_len_list.count(fsm_str_len)
                if (whole_match_count == 0) or eoi:
                    # Either no token matches the whole FSM string (each
                    # match covers only a prefix of it), or we have reached
                    # the end of input. That is, all these tokens are
                    # complete, and each of them is a candidate for the
                    # token to be returned. We choose the longest one and
                    # return it.
                    match_len_max = max(match_len_list)
                    match_len_max_count = match_len_list.count(match_len_max)
                    if match_len_max_count > 1:
                        # If there is more than one "longest token", we have
                        # an ambiguity and should indicate an error.
                        # Interpret this as an incomplete token (one that
                        # could be disambiguated with a few more characters).
                        return self._error_token(fsm_str)
                    elif match_len_max_count == 1:
                        index = match_len_list.index(match_len_max)
                        _id, Token, match = match_list[index]
                        s = buff.shift(match_len_max)
                        token = Token(s.line_no(), s.col_no(), s.char_no())
                        token.set_value(str(s))
                        return token
                    else:
                        raise RuntimeError(
                            'internal error: unexpected situation in '
                            'tokenizer: %d tokens recognized, but %d of '
                            'them would have maximum length'
                            % (match_list_len, match_len_max_count))
                elif whole_match_count > 0: # and not eoi
                    # Read more characters from the input and let the loop
                    # try to recognize a complete token.
                    eoi = (buff.extend() < buff._default_chunk)
                else:
                    raise RuntimeError(
                        'internal error: we found %d tokens matching the '
                        'whole buffer (a number less than zero!)'
                        % whole_match_count)
            else:
                raise RuntimeError('internal error: number of candidates '
                                   'for token less than zero: %d'
                                   % match_list_len)

        raise RuntimeError('internal error: loop interrupted at iteration %d' \
                         % iter_counter )
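
A typical driver loop over this API, assuming a hypothetical `Tokenizer`
class that exposes the `bind()` method mentioned in the docstring together
with the `token()` method above, could look like this:

tokenizer = Tokenizer()           # hypothetical concrete tokenizer class
tokenizer.bind(input_stream)      # bind() is referenced in the docstring
tokens = []
while True:
    tok = tokenizer.token()
    if tok is None:               # buffer exhausted, no more input
        break
    tokens.append(tok)

Error tokens come back through the same channel as regular tokens, so the
caller decides whether to report them or attempt recovery.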