def __init__( self, string_to_tokenize = '', prefix_chars = '-=<>!+*&|/%^', suffix_chars = '=<>&|' ):
    Tokenizer.__init__( self, string_to_tokenize )
    self.prefix = prefix_chars
    self.suffix = suffix_chars

    ### Setup JavaScriptTokenizer-specific regexen
    self.PREFIX            = re.compile( "[%s]" % self.prefix )
    self.SUFFIX            = re.compile( "[%s]" % self.suffix )
    self.BEGIN_IDENTIFIER  = self.CHARACTER
    self.MULTILINE_COMMENT = re.compile( r"[*]" )
    self.END_COMMENT       = re.compile( r"[/]" )
    self.ESCAPE            = re.compile( r"[\\]" )
def __init__( self, string_to_tokenize = '' ):
    Tokenizer.__init__( self, string_to_tokenize )

    ### Setup CSSTokenizer-specific regexen
    ### Throwing everything away after reading through the CSS spec.
    ### I ought to be using the specified tokens, so I will.
    #
    # IDENT          {ident}
    # ATKEYWORD      @{ident}
    # STRING         {string}
    # INVALID        {invalid}
    # HASH           #{name}
    # NUMBER         {num}
    # PERCENTAGE     {num}%
    # DIMENSION      {num}{ident}
    # URI            url\({w}{string}{w}\)
    #                |url\({w}([!#$%&*-~]|{nonascii}|{escape})*{w}\)
    # UNICODE-RANGE  U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?
    # CDO            <!--
    # CDC            -->
    # ;              ;
    # {              \{
    # }              \}
    # (              \(
    # )              \)
    # [              \[
    # ]              \]
    # S              [ \t\r\n\f]+
    # COMMENT        \/\*[^*]*\*+([^/*][^*]*\*+)*\/
    # FUNCTION       {ident}\(
    # INCLUDES       ~=
    # DASHMATCH      |=
    # DELIM          any other character not matched by the above rules,
    #                and neither a single nor a double quote
    #
    # ident      [-]?{nmstart}{nmchar}*
    # name       {nmchar}+
    # nmstart    [_a-z]|{nonascii}|{escape}
    # nonascii   [^\0-\177]
    # unicode    \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
    # escape     {unicode}|\\[^\n\r\f0-9a-f]
    # nmchar     [_a-z0-9-]|{nonascii}|{escape}
    # num        [0-9]+|[0-9]*\.[0-9]+
    # string     {string1}|{string2}
    # string1    \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
    # string2    \'([^\n\r\f\\']|\\{nl}|{escape})*\'
    # invalid    {invalid1}|{invalid2}
    # invalid1   \"([^\n\r\f\\"]|\\{nl}|{escape})*
    # invalid2   \'([^\n\r\f\\']|\\{nl}|{escape})*
    # nl         \n|\r\n|\r|\f
    # w          [ \t\r\n\f]*
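    ### A minimal sketch of how a few of the spec macros above could expand
    ### into compiled regexen; the IDENT / NUM attribute names below are
    ### illustrative assumptions, not tokens this class is known to define.
    nonascii = r"[^\x00-\x7f]"
    unicode_ = r"\\[0-9a-f]{1,6}(?:\r\n|[ \n\r\t\f])?"
    escape   = "(?:%s|%s)" % ( unicode_, r"\\[^\n\r\f0-9a-f]" )
    nmstart  = "(?:[_a-z]|%s|%s)" % ( nonascii, escape )
    nmchar   = "(?:[_a-z0-9-]|%s|%s)" % ( nonascii, escape )

    # Letters in the spec grammar match case-insensitively, hence IGNORECASE;
    # the fractional alternative of {num} is tried first so "1.5" is not cut
    # short at "1" by Python's leftmost-alternation matching.
    self.IDENT = re.compile( "-?%s%s*" % ( nmstart, nmchar ), re.IGNORECASE )
    self.NUM   = re.compile( r"[0-9]*\.[0-9]+|[0-9]+" )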
def __init__(self):
    Tokenizer.__init__(self)