def extract_sgml_tokens(self, data):
        """
        Internal: Extract tokens from inside SGML tag.

        Scans data from left to right and collects, in order of appearance:

          - the text matched by REGEX_EMIT_START_TOKEN, with '>' appended
          - the text matched by REGEX_EMIT_TRAILING (an attribute name with
            its trailing '='); the attribute value that follows is skipped
            and never emitted
          - the text matched by REGEX_EMIT_WORD (a lone attribute)

        When REGEX_EMIT_END_TAG matches, the scanner is terminated through
        s.terminate. Any character that matches none of the above is
        consumed through s.getch and ignored.

        data - SGML tag String.

        Examples

          extract_sgml_tokens("<a href='' class=foo>")
          # => ["<a>", "href="]
          # NOTE(review): output carried over from the original docstring;
          # the loop looks like it would also emit "class=" for this input.
          # Confirm against the REGEX_* definitions.

        Returns Array of token Strings.
        """
        s = StringScanner(data)
        tokens = []

        while not s.is_eos:
            # Tag opener: emitted with a closing '>' appended.
            token = s.scan(REGEX_EMIT_START_TOKEN)
            if token:
                tokens.append(token + '>')
                continue

            # Attribute name with trailing '=': emit it, then skip its value.
            token = s.scan(REGEX_EMIT_TRAILING)
            if token:
                tokens.append(token)
                if s.scan(REGEX_DOUBLE_QUOTE):
                    s.skip_until(REGEX_DOUBLE_END_QUOTE)
                elif s.scan(REGEX_SINGLE_QUOTE):
                    s.skip_until(REGEX_SINGLE_END_QUOTE)
                else:
                    # Unquoted value.
                    s.skip_until(REGEX_EMIT_WORD)
                continue

            # Lone attribute.
            token = s.scan(REGEX_EMIT_WORD)
            if token:
                tokens.append(token)
                # End the iteration here like every other branch. Falling
                # through would reach s.getch below and silently drop the
                # character that follows the attribute.
                continue

            # Stop at the end of the tag.
            if s.scan(REGEX_EMIT_END_TAG):
                # Bare attribute access kept from the original; presumably a
                # StringScanner property that moves to the end -- confirm.
                s.terminate
                continue

            # Nothing matched: drop one character and retry. Bare attribute
            # access kept from the original; presumably a StringScanner
            # property that consumes a character -- confirm.
            s.getch

        return tokens
    def extract_sgml_tokens(self, data):
        """
        Internal: Extract tokens from inside SGML tag.

        Scans data from left to right and collects, in order of appearance:

          - the text matched by REGEX_EMIT_START_TOKEN, with '>' appended
          - the text matched by REGEX_EMIT_TRAILING (an attribute name with
            its trailing '='); the attribute value that follows is skipped
            and never emitted
          - the text matched by REGEX_EMIT_WORD (a lone attribute)

        When REGEX_EMIT_END_TAG matches, the scanner is terminated through
        s.terminate. Any character that matches none of the above is
        consumed through s.getch and ignored.

        data - SGML tag String.

        Examples

          extract_sgml_tokens("<a href='' class=foo>")
          # => ["<a>", "href="]
          # NOTE(review): output carried over from the original docstring;
          # the loop looks like it would also emit "class=" for this input.
          # Confirm against the REGEX_* definitions.

        Returns Array of token Strings.
        """
        s = StringScanner(data)
        tokens = []

        while not s.is_eos:
            # Tag opener: emitted with a closing '>' appended.
            token = s.scan(REGEX_EMIT_START_TOKEN)
            if token:
                tokens.append(token + '>')
                continue

            # Attribute name with trailing '=': emit it, then skip its value.
            token = s.scan(REGEX_EMIT_TRAILING)
            if token:
                tokens.append(token)
                if s.scan(REGEX_DOUBLE_QUOTE):
                    s.skip_until(REGEX_DOUBLE_END_QUOTE)
                elif s.scan(REGEX_SINGLE_QUOTE):
                    s.skip_until(REGEX_SINGLE_END_QUOTE)
                else:
                    # Unquoted value.
                    s.skip_until(REGEX_EMIT_WORD)
                continue

            # Lone attribute.
            token = s.scan(REGEX_EMIT_WORD)
            if token:
                tokens.append(token)
                # End the iteration here like every other branch. Falling
                # through would reach s.getch below and silently drop the
                # character that follows the attribute.
                continue

            # Stop at the end of the tag.
            if s.scan(REGEX_EMIT_END_TAG):
                # Bare attribute access kept from the original; presumably a
                # StringScanner property that moves to the end -- confirm.
                s.terminate
                continue

            # Nothing matched: drop one character and retry. Bare attribute
            # access kept from the original; presumably a StringScanner
            # property that consumes a character -- confirm.
            s.getch

        return tokens
    def extract_tokens(self, data):
        """
        Internal: Extract generic tokens from data.

        Walks data with a StringScanner until the end of input or until the
        scan position reaches BYTE_LIMIT. Shebang lines are emitted as
        'SHEBANG#!<name>' when extract_shebang returns a name. Comments,
        quoted strings and number literals are skipped without emitting
        anything. SGML-style tags are expanded through extract_sgml_tokens.
        Punctuation, regular tokens and operators are emitted as matched.

        data - String to scan.

        Examples

          extract_tokens("printf('Hello')")
          # => ['printf', '(', ')']

        Returns Array of token Strings.
        """
        scanner = StringScanner(data)
        collected = []

        # Stop at end of input or once the byte budget is used up.
        while not scanner.is_eos and scanner.pos < BYTE_LIMIT:
            # Shebang line: emitted only when a name can be extracted.
            shebang = scanner.scan(REGEX_SHEBANG)
            if shebang:
                name = self.extract_shebang(shebang)
                if name:
                    collected.append('SHEBANG#!%s' % name)
                continue

            # Single line comment, recognised only at beginning of line.
            if scanner.is_bol and scanner.scan(START_SINGLE_LINE_COMMENT):
                scanner.skip_until(REGEX_BOL)
                continue

            # Multiline comment: jump to the closer paired with the opener.
            opener = scanner.scan(START_MULTI_LINE_COMMENT)
            if opener:
                scanner.skip_until(MULTI_LINE_COMMENT_DICT[opener])
                continue

            # Double quoted string; an immediately repeated quote is
            # consumed as a single character instead of searching for an end.
            if scanner.scan(REGEX_DOUBLE_QUOTE):
                if scanner.peek(1) != '"':
                    scanner.skip_until(REGEX_DOUBLE_END_QUOTE)
                else:
                    # Bare attribute access; presumably a StringScanner
                    # property that consumes one character -- confirm.
                    scanner.getch
                continue

            # Single quoted string, same handling as above.
            if scanner.scan(REGEX_SINGLE_QUOTE):
                if scanner.peek(1) != "'":
                    scanner.skip_until(REGEX_SINGLE_END_QUOTE)
                else:
                    scanner.getch
                continue

            # Number literals are dropped.
            if scanner.scan(REGEX_NUMBER_LITERALS):
                continue

            # SGML style brackets: delegate to the tag tokenizer.
            tag = scanner.scan(REGEX_SGML)
            if tag:
                collected.extend(self.extract_sgml_tokens(tag))
                continue

            # Punctuation, then regular tokens, then operators. The first
            # pattern that matches wins and is emitted verbatim.
            plain = (scanner.scan(REGEX_COMMON_PUNCTUATION)
                     or scanner.scan(REGEX_REGULAR_TOKEN)
                     or scanner.scan(REGEX_COMMON_OPERATORS))
            if plain:
                collected.append(plain)
                continue

            # Nothing matched: drop one character and retry.
            scanner.getch

        return collected
    def extract_tokens(self, data):
        """
        Internal: Extract generic tokens from data.

        Walks data with a StringScanner until the end of input or until the
        scan position reaches BYTE_LIMIT. Shebang lines are emitted as
        'SHEBANG#!<name>' when extract_shebang returns a name. Comments,
        quoted strings and number literals are skipped without emitting
        anything. SGML-style tags are expanded through extract_sgml_tokens.
        Punctuation, regular tokens and operators are emitted as matched.

        data - String to scan.

        Examples

          extract_tokens("printf('Hello')")
          # => ['printf', '(', ')']

        Returns Array of token Strings.
        """
        scanner = StringScanner(data)
        collected = []

        # Stop at end of input or once the byte budget is used up.
        while not scanner.is_eos and scanner.pos < BYTE_LIMIT:
            # Shebang line: emitted only when a name can be extracted.
            shebang = scanner.scan(REGEX_SHEBANG)
            if shebang:
                name = self.extract_shebang(shebang)
                if name:
                    collected.append('SHEBANG#!%s' % name)
                continue

            # Single line comment, recognised only at beginning of line.
            if scanner.is_bol and scanner.scan(START_SINGLE_LINE_COMMENT):
                scanner.skip_until(REGEX_BOL)
                continue

            # Multiline comment: jump to the closer paired with the opener.
            opener = scanner.scan(START_MULTI_LINE_COMMENT)
            if opener:
                scanner.skip_until(MULTI_LINE_COMMENT_DICT[opener])
                continue

            # Double quoted string; an immediately repeated quote is
            # consumed as a single character instead of searching for an end.
            if scanner.scan(REGEX_DOUBLE_QUOTE):
                if scanner.peek(1) != '"':
                    scanner.skip_until(REGEX_DOUBLE_END_QUOTE)
                else:
                    # Bare attribute access; presumably a StringScanner
                    # property that consumes one character -- confirm.
                    scanner.getch
                continue

            # Single quoted string, same handling as above.
            if scanner.scan(REGEX_SINGLE_QUOTE):
                if scanner.peek(1) != "'":
                    scanner.skip_until(REGEX_SINGLE_END_QUOTE)
                else:
                    scanner.getch
                continue

            # Number literals are dropped.
            if scanner.scan(REGEX_NUMBER_LITERALS):
                continue

            # SGML style brackets: delegate to the tag tokenizer.
            tag = scanner.scan(REGEX_SGML)
            if tag:
                collected.extend(self.extract_sgml_tokens(tag))
                continue

            # Punctuation, then regular tokens, then operators. The first
            # pattern that matches wins and is emitted verbatim.
            plain = (scanner.scan(REGEX_COMMON_PUNCTUATION)
                     or scanner.scan(REGEX_REGULAR_TOKEN)
                     or scanner.scan(REGEX_COMMON_OPERATORS))
            if plain:
                collected.append(plain)
                continue

            # Nothing matched: drop one character and retry.
            scanner.getch

        return collected