def tokenize(str): 'str -> Sequence(Token)' specs = [ ('comment', (r'//.*',)), ('newline', (r'[\r\n]+',)), ('space', (r'[ \t\r\n]+',)), ('name', (r'(?!(?:as|exists|priority|reaction)\b)[A-Za-z\200-\377_]([A-Za-z\200-\377_0-9])*',)), ('kw_exists', (r'exists',)), ('kw_reaction', (r'reaction',)), ('kw_as', (r'as',)), ('kw_priority', (r'priority',)), ('op_priority_maximal', (r'>>',)), ('op_tilde', (r'~',)), ('op_production', (r'::',)), ('op_dissolve', (r'\$',)), ('op_osmose_location', (r'!!',)), ('op_osmose', (r'!',)), ('mod_catalyst', (r'\*',)), ('mod_charge_positive', (r'\+',)), ('mod_charge_negative', (r'-',)), ('env_open', (r'\[',)), ('env_close', (r'\]',)), ('membrane_open', (r'\(',)), ('membrane_close', (r'\)',)), ('number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)) ] useless = ['comment', 'space', 'newline'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str):
    """ Generates a list of tokens from the given string. """
    specs = [
        ('Space', (r'[ \t\r\n]+',)),
        ('True', ('true',)),
        ('False', ('false',)),
        ('If', ('if',)),
        ('Then', ('then',)),
        ('Else', ('else',)),
        ('Fi', ('fi',)),
        ('Call', ('call',)),
        ('Lp', (r'\(',)),
        ('Comma', (',',)),
        ('Rp', (r'\)',)),
        ('Let', ('let',)),
        ('In', ('in',)),
        ('End', ('end',)),
        ('Fun', ('fun',)),
        ('Arrow', ('=>',)),
        ('Prog', ('prog',)),
        ('Op', (r'[\-+/*=<>]',)),
        ('Var', (r'[A-Za-z][A-Za-z_0-9]*',)),
        ('Number', (r'(0|([1-9][0-9]*))', VERBOSE)),
        ('Semicolon', (';',)),
    ]
    useless = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
def tokenize(string):
    """ Generates a list of tokens from the given string. """
    specs = [
        ('Space', (r'[ \t\r\n]+',)),
        ('Or', (r'\|\|',)),
        ('And', ('&&',)),
        ('Neq', ('!=',)),
        ('Not', ('!',)),
        ('Eq', ('==',)),
        ('Assign', (':=',)),
        ('Le', ('<=',)),
        ('Ge', ('>=',)),
        ('Dot', (r'\.',)),
        ('Op', (r'[\-+/*=<>%]',)),
        ('Ident', (r'[A-Za-z][A-Za-z_0-9]*',)),
        ('Number', (r'(0|([1-9][0-9]*))', VERBOSE)),
        ('Semicolon', (';',)),
        ('Comma', (',',)),
        ('Lb', ('{',)),
        ('Rb', ('}',)),
        ('Lp', (r'\(',)),
        ('Rp', (r'\)',)),
        ('String', ('"[^"]*"',)),
    ]
    useless = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(string) if x.type not in useless]
def tokenize(string): """str -> Sequence(Token)""" # flake8: NOQA specs = [ # NOQA ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)), # NOQA ('Comment', (r'(//|#).*', )), # NOQA ('NL', (r'[\r\n]+', )), # NOQA ('QuotedRackItem', (r'(?<=[:*\-])\s*(?P<quote>"|\').*?(?<!\\)(?P=quote)', DOTALL)), # NOQA ('RackItem', (r'(?<=[:*\-])[^\r\n\[;}]+', )), # NOQA ('Space', (r'[ \t\r\n]+', )), # NOQA ('RackHeight', (r'[0-9]+U', )), # NOQA ('Units', (r'[0-9]+(?:\.[0-9]+)?(A|kg)', )), # NOQA ('Number', (r'[0-9]+', )), # NOQA ( 'Name', ( u('[A-Za-z_0-9\u0080-\uffff]') + # NOQA u('[A-Za-z_\\-.0-9\u0080-\uffff]*'), )), # NOQA ('Op', (r'[{}:;,*\-=\[\]]', )), # NOQA ('String', (r'(?P<quote>"|\').*?(?<!\\)(?P=quote)', DOTALL)), # NOQA ] useless = ['Comment', 'NL', 'Space'] t = make_tokenizer(specs) return [x for x in t(string) if x.type not in useless]
def tokenize(s): """Tokenize a type string. :type s: unicode """ f = make_tokenizer(token_specs) return [t for t in f(s) if t.type != 'space']
def token_phase(characters: str): specs = [ ("comment", (r"#.*",)), ("space", (r"[ \t\r]+",)), ( "float", ( r""" -? # Minus ([0-9]+) # Int (\.[0-9]+) # Frac ([Ee][+-]?[0-9]+)? # Exp""", VERBOSE, ), ), ("integer", (r"0|([1-9][0-9]*)",)), ("name", (r"[A-Za-z_][A-Za-z_0-9]*",)), ("grouping", (r"[\(\)\[\]\{\}]",)), ("operator", (r"[~!@#$%^&*<>:?/\\|\-\+=]+",)), ("prime", (r"'",)), ] useless = ["comment", "space"] tokenizer = make_tokenizer(specs) return tuple(token for token in tokenizer(characters) if token.type not in useless)
def tokenize( string ): """str -> Sequence(Token)""" # Basic Tokens Spec specs = [ ( 'Comment', ( r' *#.*', ) ), ( 'Space', ( r'[ \t\r\n]+', ) ), ( 'USBCode', ( r'U(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'USBCodeStart', ( r'U\[', ) ), ( 'ScanCode', ( r'S((0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'ScanCodeStart', ( r'S\[', ) ), ( 'CodeEnd', ( r'\]', ) ), ( 'String', ( r'"[^"]*"', VERBOSE ) ), ( 'SequenceString', ( r"'[^']*'", ) ), ( 'Operator', ( r'=>|:\+|:-|:|=', ) ), ( 'Comma', ( r',', ) ), ( 'Dash', ( r'-', ) ), ( 'Plus', ( r'\+', ) ), ( 'Parenthesis', ( r'\(|\)', ) ), ( 'Number', ( r'-?(0x[0-9a-fA-F]+)|(0|([1-9][0-9]*))', VERBOSE ) ), ( 'Name', ( r'[A-Za-z_][A-Za-z_0-9]*', ) ), ( 'VariableContents', ( r'''[^"' ;:=>()]+''', ) ), ( 'EndOfLine', ( r';', ) ), ] # Tokens to filter out of the token stream useless = ['Space', 'Comment'] tokens = make_tokenizer( specs ) return [x for x in tokens( string ) if x.type not in useless]
def tokenize(str): """str -> Sequence(Token)""" specs = [ ('COMMENT', (r'//.*', )), ('COMMENT', (r'/\*(.|[\r\n])*?\*/', re.MULTILINE)), ('NL', (r'[\r\n]+', )), ('SPACE', (r'[ \t\r\n]+', )), ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*', )), ('INT', (r'[0-9]+', )), ('INT', (r'\$[0-9A-Fa-f]+', )), ('OP', (r'(::)|(:)|(-)|(=)|(\()|(\))', )), ('FIELD', (r'(bool|char|date|datetime|decimal|duration|email|float|int|slug|text|time|url|uuid|fk|m2m|o2o)', )), ('NONE', (r'none', )), ('FALSE', (r'false', )), ('TRUE', (r'true', )), ('NAME', (r'([A-Za-z_.][A-Za-z_0-9.]*)', )), #('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)), ('STRING', (r"'([^']|(''))*'", )), ('ESCAPES', (r',', )), ] useless = ['SPACE', 'NL', 'COMMENT', 'ESCAPES'] t = make_tokenizer(specs) ret = [x for x in t(str) if x.type not in useless] return ret
def tokenize(string): specs = [ ('Regex', (r'({.+}|<re>.+?</re>)', re.UNICODE)), ('Op', (r':',)), ('String', (r"[^<{|}>:][^<{|}>:]*", re.UNICODE)), ] tok = make_tokenizer(specs) return [t for t in tok(string)]
def lex_braced_expr(string): """Lex a braced expression.""" tokenizer = lexer.make_tokenizer([ ('lbrace', [r'{']), ('rbrace', [r'}']), ('content', [r'[^{}]+']), ]) return remove_whitespace_tokens(tokenizer(string))
def lex_date(date_string): """Lex a string into biblatex date tokens.""" tokenizer = lexer.make_tokenizer([ ('number', [r'[0-9]+']), ('dash', [r'-']), ('slash', [r'/']) ]) return tokenizer(date_string)
def tokenize(str):
    """Returns tokens of the given string."""
    specs = [
        ('Space', (r'[ \t\r\n]+',)),  # whitespace, filtered out below
        ('Op', (r'[|\(\)\*]',)),
        ('Char', (r'[A-Za-z0-9]',)),
    ]
    useless = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
def tokenize(s):
    specs = [
        ('whitespace', (r'[ \t]', )),
        ('newline', (r'[\n]', )),
        ('instruction', (r'(inc|dec|zero|else|stop)', )),
        ('variable', (r'[xyz]', )),
        ('number', (r'[0-9]+', )),
    ]
    f = make_tokenizer(specs)
    return [t for t in f(s) if t.type != 'whitespace']
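# A usage sketch for the register-machine tokenizer above (assumes make_tokenizer
# comes from funcparserlib.lexer, as in the surrounding snippets; illustrative only).
# Note that only 'whitespace' tokens are filtered, so newline tokens come through.
tokens = tokenize('inc x\nzero y 3\n')
assert [t.type for t in tokens] == [
    'instruction', 'variable', 'newline',
    'instruction', 'variable', 'number', 'newline',
]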
def str_tokenize(string): 'unicode -> Sequence(Token)' specs = [ ('JunkSpace', (r'[\r\n\t]+', )), ('Space', (r'[ ]+', )), ('Op', (r'[:/\[\]]', )), ('Name', (r'[^:/ \[\]\r\t\n]+', re.UNICODE)), ] useless = ['JunkSpace'] tok = make_tokenizer(specs) return [x for x in tok(string) if x.type not in useless]
def test_error_info(): tokenize = make_tokenizer([ Spec('keyword', r'(is|end)'), Spec('id', r'[a-z]+'), Spec('space', r'[ \t]+'), Spec('nl', r'[\n\r]+'), ]) try: list(tokenize(u'f is ф')) except LexerError, e: eq_(unicode(e), u'1,6-1,6: cannot tokenize data: "f is \u0444"')
def tokenize(string):
    """ str -> Sequence(Token) """
    specs = [
        ('Space', (r'[ \t\r\n]+', )),
        ('String', (r'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE)),
        ('Op', (r'[\[\],()]', )),
        ('Name', (r'[A-Za-z_][A-Za-z_0-9]*', )),
    ]
    empty = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(string) if x.type not in empty]
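# The snippet above, like several JSON-style tokenizers further down, interpolates
# an externally defined `regexps` mapping into its String pattern. A minimal sketch
# of that mapping, mirroring the escaped/unescaped definitions that appear verbatim
# in later snippets; treat it as an assumption about the omitted module-level constant.
import re

regexps = {
    'escaped': r'\\(?P<standard>["\\/bfnrt])',  # standard JSON-style escape sequences
    'unescaped': r'[^"\\]',                     # any character except quote or backslash
}

# With re.VERBOSE, the interpolated pattern matches a double-quoted string literal.
string_re = re.compile(r'"(%(unescaped)s | %(escaped)s)*"' % regexps, re.VERBOSE)
assert string_re.match(r'"hello \n world"') is not None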
def str_tokenize(string): 'unicode -> Sequence(Token)' specs = [ ('JunkSpace', (r'[\r\n\t]+',)), ('Space', (r'[ ]+',)), ('Op', (r'[:/\[\]]',)), ('Name', (ur'[^:/ \[\]\r\t\n]+', re.UNICODE)), ] useless = ['JunkSpace'] tok = make_tokenizer(specs) return [x for x in tok(string) if x.type not in useless]
def tokenize(str):
    """Returns tokens of the given string."""
    specs = [
        ('Space', (r'[ \t\r\n]+',)),
        ('Dot', (r'\.',)),
        ('Name', (r'[A-Za-z_][A-Za-z_0-9]*',)),
        ('Lambda', (u'λ',)),
        ('Parentheses', (r'[\(\)]',)),
    ]
    useless = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
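# A usage sketch for the λ-calculus tokenizer above (assumes make_tokenizer comes
# from funcparserlib.lexer; Token objects expose .type and .value, as used in the
# test snippets further down).
tokens = tokenize('λx.x')
assert [(t.type, t.value) for t in tokens] == [
    ('Lambda', 'λ'), ('Name', 'x'), ('Dot', '.'), ('Name', 'x'),
]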
def tokenize(to_tokenize_str):
    specs = [
        ('Space', (r'[ \t\r\n]+', )),
        ('Word', (r'[\w\-\_]+', )),
        ('Op', (r'[()|!&]', )),  # parentheses and boolean operators; these need no escaping inside a character class
    ]
    useless = [u'Space']
    tokenizer = make_tokenizer(specs)
    return [
        token for token in tokenizer(to_tokenize_str)
        if token.type not in useless
    ]
def tokenize(input): token_specs = [ ('NAME', (r'[A-Za-z_][A-Za-z_0-9-]*',)), ('REGEXP', (r'/.*/',)), ('STRING', (r'"((\\")|[^"])*"',)), ('OP', (r'([{}\[\]?$:,|@%!/&]|\.{3})',)), ('NUMBER', (r'-?(0|[1-9]\d*)(\.\d+)?',)), ('COMMENT', (r'#.*',)), ('NL', (r'[\r\n]+([ \t]+[\r\n]+)*',)), ('SPACE', (r'[ \t]+',)) ] return indentation(make_tokenizer(token_specs)(input + "\n"))
def test_error_info(self): tokenize = make_tokenizer([ (u'keyword', (ur'(is|end)',)), (u'id', (ur'[a-z]+',)), (u'space', (ur'[ \t]+',)), (u'nl', (ur'[\n\r]+',)), ]) try: list(tokenize(u'f is ф')) except LexerError, e: self.assertEqual(unicode(e), u'cannot tokenize data: 1,6: "f is \u0444"')
def tokenize(to_tokenize_str):
    specs = [
        ('Space', (r'[ \t\r\n]+',)),
        ('Word', (r'[\w\-\_]+',)),
        ('Op', (r'[()|!&]',)),  # parentheses and boolean operators; these need no escaping inside a character class
    ]
    useless = [u'Space']
    tokenizer = make_tokenizer(specs)
    return [
        token for token in tokenizer(to_tokenize_str)
        if token.type not in useless
    ]
def tokenize(str):
    """Returns tokens of the given string."""
    specs = [
        ('Space', (r'[ \t\r\n]+',)),
        ('Number', (r'''
            (0|([1-9][0-9]*))   # Int
            ''', VERBOSE)),
        ('Op', (r'[\-+/*\(\),]',)),
        ('Name', (r'[A-Za-z_][A-Za-z_0-9]*',)),
    ]
    useless = ['Space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
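# A usage sketch for the arithmetic tokenizer above (assumes
# `from funcparserlib.lexer import make_tokenizer` and `from re import VERBOSE`
# at module level, as these snippets generally do; illustrative only).
tokens = tokenize('max(1, 20) - 3')
assert [t.type for t in tokens] == [
    'Name', 'Op', 'Number', 'Op', 'Number', 'Op', 'Op', 'Number',
]
assert [t.value for t in tokens][:3] == ['max', '(', '1']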
def tokenizer(str):
    'str -> Sequence(Token)'
    specs = [
        ('space', (r'[ \t\r\n]+',)),
        ('int', (r'-?[1-9][0-9]*|0',)),
        ('true', (r'#t',)),
        ('false', (r'#f',)),
        ('char', (r'#\\[A-Za-z_0-9]',)),
        ('op', (r'[\[\]\(\)\']', re.VERBOSE)),
        ('name', (r'[A-Za-z_0-9\&\*\+\-\~!\=<>\^/,\?:;.]+',)),  # one or more symbol characters; a '*' here could match zero characters and stall the lexer
    ]
    useless = ['space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
def tokenize(s): regexps = { 'escaped': r'\\(?P<standard>["\\/bfnrt])', 'unescaped': r'[^"\\]' } grammar_specifications = [ ('space', (r'[ \t\r\n]+',)), ('number', (r'-?(0|([1-9][0-9]*))(\.[0-9]+)?([Ee][+-][0-9]+)?',)), ('string', (r'"[^"]*"',)), # unsupported escaped quotes ('operator', (r'(\*\*)|([><=!]=)|(and)|(or)|(not)|(in)|[{}\[\]\(\)\-\+\*/=><\.,:]',)), ('name', (r'[A-Za-z_][A-Za-z_0-9]*',)), ] t = make_tokenizer(grammar_specifications) return [x for x in t(s) if x.type not in ['space']]
def tokenize(str):
    """str -> Sequence(Token)"""
    specs = [
        ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)),
        ('Comment', (r'//.*', )),
        ('NL', (r'[\r\n]+', )),
        ('Space', (r'[ \t\r\n]+', )),
        ('Name', (r'[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*', )),
        ('Op', (r'[{};,=\[\]]|(->)|(--)', )),
        ('Number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)', )),
        ('String', (r'"[^"]*"', )),  # '\"' escapes are ignored
    ]
    useless = ['Comment', 'NL', 'Space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
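# A usage sketch for the Graphviz DOT-style tokenizer above (assumes MULTILINE is
# re.MULTILINE and make_tokenizer is funcparserlib's, as elsewhere in these snippets).
tokens = tokenize('digraph g { a -> b; }')
assert [(t.type, t.value) for t in tokens] == [
    ('Name', 'digraph'), ('Name', 'g'), ('Op', '{'),
    ('Name', 'a'), ('Op', '->'), ('Name', 'b'), ('Op', ';'), ('Op', '}'),
]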
def tokenize(str): 'str -> Sequence(Token)' specs = [ make_multiline_comment(r'/\*', r'\*/'), make_comment(r'//'), newline, space, Spec('name', r'[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*'), Spec('op', r'[{};,=\[\]]|(->)|(--)'), Spec('number', r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)'), Spec('string', r'"[^"]*"'), # '\"' escapes are ignored ] useless = ['comment', 'newline', 'space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def lex_string_expr(string): """Lex a string expression.""" tokenizer = lexer.make_tokenizer([ ('concat', [r'#']), ('string', [r'"[^"]+"']), ('name', [r'[A-Za-z_][A-Za-z_0-9\-:?\'\.\s]*']), ('space', [r'[ \t\r\n]+']), ]) try: return remove_whitespace_tokens(tokenizer(string)) except lexer.LexerError: # If we fail to lex the string, it is not a valid string expression so # just return it as a single token return [Token('string', string)]
def tokenize(s): """str -> Sequence(Token)""" specs = [ (u'Space', (ur'[ \t\r\n]+', )), (u'String', (ur'"(%(unescaped_str)s | %(escaped)s)*"' % regexps, re.VERBOSE)), (u'Regex', (ur'/(%(unescaped_regex)s | %(escaped_regex)s)*/[i]*' % regexps, re.VERBOSE)), (u'Op', (ur'or|and|not|[\(\)]', )), (u'Prefix', (ur'client:|server:|any:', )), ] useless = [u'Space'] t = make_tokenizer(specs) return [x for x in t(s) if x.type not in useless]
def tokenize(str): """str -> Sequence(Token)""" specs = [ ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)), ('Comment', (r'//.*',)), ('NL', (r'[\r\n]+',)), ('Space', (r'[ \t\r\n]+',)), ('Name', (r'[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*',)), ('Op', (r'[{};,=\[\]]|(->)|(--)',)), ('Number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)), ('String', (r'"[^"]*"',)), # '\"' escapes are ignored ] useless = ['Comment', 'NL', 'Space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str): 'str -> Sequence(Token)' specs = [ ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)), ('Comment', (r'//.*',)), ('NL', (r'[\r\n]+',)), ('Space', (r'[ \t\r\n]+',)), ('Name', (ur'[A-Za-z_\u0080-\uffff][A-Za-z_0-9\.\u0080-\uffff]*',)), ('Op', (r'[{}():;,=\[\]]',)), ('Color', (r'[A-Za-z0-9]+',)), ('Number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)), ('String', (r'(?P<quote>"|\').*?(?<!\\)(?P=quote)', DOTALL)), ] useless = ['Comment', 'NL', 'Space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def create_tokenizer():
    specs = [
        ('Space', (r'[ \t\r\n]+', )),
        ('String', (r'"(%(unescaped)s | %(escaped)s)*"' % REGEXPS, re.VERBOSE)),
        ('Number', (r'''
            -?                  # Minus
            (0|([1-9][0-9]*))   # Int
            (\.[0-9]+)?         # Frac
            ([Ee][+-][0-9]+)?   # Exp
            ''', re.VERBOSE)),
        ('Op', (r'[{}\[\]\-,:]', )),
        ('Name', (r'[A-Za-z_][A-Za-z_0-9]*', )),
    ]
    return make_tokenizer(specs)
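# Unlike most snippets here, create_tokenizer() returns the raw tokenizer, so
# whitespace filtering happens at the call site. A usage sketch, assuming the
# module-level REGEXPS mapping it interpolates is defined analogously to the
# regexps dict shown earlier (illustrative only).
tokenize_json = create_tokenizer()
tokens = [t for t in tokenize_json('{"n": 1}') if t.type != 'Space']
assert [t.type for t in tokens] == ['Op', 'String', 'Op', 'Number', 'Op']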
def tokenize(s): specs = [ ('comment', (r'#.*',)), ('newline', (r'[\r\n]+',)), ('space', (r'[ \t\r\n]+',)), ('name', (r'[a-zA-Z_][a-zA-Z_0-9]*',)), # _FooBar9_Baz ('number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)), # -3.1415 ('op', (r'[\+\-/\*\(\),]',)), # + - / * ( ) , ] f = make_tokenizer(specs) useless = ['comment', 'newline', 'space'] try: return [tok for tok in f(s) if tok.type not in useless] except LexerError, e: logger.error(unicode(e)) raise
def tokenize(str): 'str -> Sequence(Token)' specs = [ ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)), ('Comment', (r'//.*', )), ('NL', (r'[\r\n]+', )), ('Space', (r'[ \t\r\n]+', )), ('Name', (ur'[A-Za-z_\u0080-\uffff][A-Za-z_0-9\.\u0080-\uffff]*', )), ('Op', (r'[{}():;,=\[\]]', )), ('Color', (r'[A-Za-z0-9]+', )), ('Number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)', )), ('String', (r'(?P<quote>"|\').*?(?<!\\)(?P=quote)', DOTALL)), ] useless = ['Comment', 'NL', 'Space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str): specs = [ ('With', (r'WITH',)), ('In', (r'IN',)), ('Set', (r'SET',)), ('Equals', (r'=',)), ('Space', (r'[ \t\r\n]+',)), ('Value', (r'\".*?\"',)), ('Attribute', (r'[A-Za-z][A-Za-z0-9]*',)), ] useless = ['Space'] return list( filter( lambda x: x.type not in useless, make_tokenizer(specs)(str) ) )
def tokenize(string): """str -> Sequence(Token)""" # flake8: NOQA specs = [ # NOQA ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)), # NOQA ('Comment', (r'(//|#).*',)), # NOQA ('NL', (r'[\r\n]+',)), # NOQA ('Space', (r'[ \t\r\n]+',)), # NOQA ('Name', ('[A-Za-z_0-9\u0080-\uffff]' + # NOQA '[A-Za-z_\\-.0-9\u0080-\uffff]*',)), # NOQA ('Op', (r'[{};,=\[\]]|(<->)|(<-)|(--)|(->)|(>-<)|(-<)|(>-)',)), # NOQA ('Number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)), # NOQA ('String', (r'(?P<quote>"|\').*?(?<!\\)(?P=quote)', DOTALL)), # NOQA ] useless = ['Comment', 'NL', 'Space'] t = make_tokenizer(specs) return [x for x in t(string) if x.type not in useless]
def tokenize(s): regexps = { 'escaped': r'\\(?P<standard>["\\/bfnrt])', 'unescaped': r'[^"\\]' } grammar_specifications = [ ('space', (r'[ \t\r\n]+', )), ('number', (r'-?(0|([1-9][0-9]*))(\.[0-9]+)?([Ee][+-][0-9]+)?', )), ('string', (r'"[^"]*"', )), # unsupported escaped quotes ('operator', (r'(\*\*)|([><=!]=)|(and)|(or)|(not)|(in)|[{}\[\]\(\)\-\+\*/=><\.,:]', )), ('name', (r'[A-Za-z_][A-Za-z_0-9]*', )), ] t = make_tokenizer(grammar_specifications) return [x for x in t(s) if x.type not in ['space']]
def tokenize(str): 'str -> Sequence(Token)' specs = [ Spec('space', r'[ \t\r\n]+'), Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE), Spec('number', r''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp ''', VERBOSE), Spec('op', r'[{}\[\]\-,:]'), Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'), ] useless = ['space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str): """str -> Sequence(Token)""" specs = [(u'keyword', (ur'({|}|def|context|environment|\.|\(|\))', )), (u'Space', (ur'[ \t\r\n]+', )), (u'String', (ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE)), (u'Number', (ur''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp ''', VERBOSE)), (u'Eq', (ur'=', )), (u'Sep', (ur',', )), (u'Op', (ur'[{}\[\]\-,:]', )), (u'Name', (ur'[A-Za-z_][A-Za-z_0-9]*', ))] useless = [u'Space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str): 'str -> Sequence(Token)' specs = [ Spec('space', r'[ \t\r\n]+'), Spec('number', r''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp (L)? # Long ''', VERBOSE), Spec('op', r'[()\[\]\-,:]'), Spec('name', r'[A-Za-z_][A-Za-z_]*'), ] useless = ['space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(string): """str -> Sequence(Token)""" # flake8: NOQA specs = [ # NOQA ('Comment', (r'/\*(.|[\r\n])*?\*/', MULTILINE)), # NOQA ('Comment', (r'(//|#).*',)), # NOQA ('NL', (r'[\r\n]+',)), # NOQA ('Space', (r'[ \t\r\n]+',)), # NOQA ('Name', (u('[A-Za-z_0-9\u0080-\uffff]') + # NOQA u('[A-Za-z_\\-.0-9\u0080-\uffff]*'),)), # NOQA ('Op', (r'[{};,=\[\]]|(<->)|(<-)|(--)|(->)',)), # NOQA ('Number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)), # NOQA ('String', (r'(?P<quote>"|\').*?(?<!\\)(?P=quote)', DOTALL)), # NOQA ] useless = ['Comment', 'NL', 'Space'] t = make_tokenizer(specs) return [x for x in t(string) if x.type not in useless]
def tokenize(str): """str -> Sequence(Token)""" specs = [ ('Space', (r'[ \t\r\n]+', )), ('String', (r'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE)), ('Number', (r''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp ''', VERBOSE)), ('Op', (r'[{}\[\]\-,:]', )), ('Name', (r'[A-Za-z_][A-Za-z_0-9]*', )), ] useless = ['Space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str): """str -> Sequence(Token)""" specs = [ ('Space', (r'[ \t\r\n]+',)), ('String', (r'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE)), ('Number', (r''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp ''', VERBOSE)), ('Op', (r'[{}\[\]\-,:]',)), ('Name', (r'[A-Za-z_][A-Za-z_0-9]*',)), ] useless = ['Space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(string): """str -> Sequence(Token)""" # Basic Tokens Spec specs = [ ('Comment', (r' *#.*', )), ('Space', (r'[ \t\r\n]+', )), ('USBCode', (r'U(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', )), ('USBCodeStart', (r'U\[', )), ('ConsCode', (r'CONS(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', )), ('ConsCodeStart', (r'CONS\[', )), ('SysCode', (r'SYS(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', )), ('SysCodeStart', (r'SYS\[', )), ('LedCode', (r'LED(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', )), ('LedCodeStart', (r'LED\[', )), ('ScanCode', (r'S((0x[0-9a-fA-F]+)|([0-9]+))', )), ('ScanCodeStart', (r'S\[', )), ('PixelCodeStart', (r'P\[.*', )), # Discarded, needs KLL 0.5 ('AnimationStart', (r'A\[.*', )), # Discarded, needs KLL 0.5 ('CodeStart', (r'\[', )), ('CodeEnd', (r'\]', )), ('String', (r'"[^"]*"', )), ('SequenceString', (r"'[^']*'", )), ('Position', (r'r?[xyz]:-?[0-9]+(.[0-9]+)?', )), ('Operator', (r'<=|=>|:\+|:-|::|:|=', )), ('Number', (r'(-[ \t]*)?((0x[0-9a-fA-F]+)|(0|([1-9][0-9]*)))', VERBOSE)), ('Comma', (r',', )), ('Dash', (r'-', )), ('Plus', (r'\+', )), ('Parenthesis', (r'\(|\)', )), ('None', (r'None', )), ('Name', (r'[A-Za-z_][A-Za-z_0-9]*', )), ('VariableContents', (r'''[^"' ;:=>()]+''', )), ('EndOfLine', (r';', )), ] # Tokens to filter out of the token stream useless = ['Space', 'Comment'] # Discarded expresssions (KLL 0.4+) useless.extend(['PixelCodeStart', 'AnimationStart']) tokens = make_tokenizer(specs) return [x for x in tokens(string) if x.type not in useless]
def test_error_info(self): tokenize = make_tokenizer([ ('keyword', (r'(is|end)',)), ('id', (r'[a-z]+',)), ('space', (r'[ \t]+',)), ('nl', (r'[\n\r]+',)), ]) try: list(tokenize('f is ф')) except LexerError as e: self.assertEqual(str(e), 'cannot tokenize data: 1,6: "f is \u0444"') else: self.fail('must raise LexerError') sometok = lambda type: some(lambda t: t.type == type) keyword = lambda s: a(Token('keyword', s)) id = sometok('id') is_ = keyword('is') end = keyword('end') nl = sometok('nl') equality = id + skip(is_) + id >> tuple expr = equality + skip(nl) file = many(expr) + end msg = """\ spam is eggs eggs isnt spam end""" toks = [x for x in tokenize(msg) if x.type != 'space'] try: file.parse(toks) except NoParseError as e: self.assertEqual(e.msg, "got unexpected token: 2,11-2,14: id 'spam'") self.assertEqual(e.state.pos, 4) self.assertEqual(e.state.max, 7) # May raise KeyError t = toks[e.state.max] self.assertEqual(t, Token('id', 'spam')) self.assertEqual((t.start, t.end), ((2, 11), (2, 14))) else: self.fail('must raise NoParseError')
def test_error_info(self): tokenize = make_tokenizer([ ('keyword', (r'(is|end)',)), ('id', (r'[a-z]+',)), ('space', (r'[ \t]+',)), ('nl', (r'[\n\r]+',)), ]) try: list(tokenize('f is ф')) except LexerError as e: self.assertEqual(six.text_type(e), 'cannot tokenize data: 1,6: "f is \u0444"') else: self.fail('must raise LexerError') sometok = lambda type: some(lambda t: t.type == type) keyword = lambda s: a(Token('keyword', s)) id = sometok('id') is_ = keyword('is') end = keyword('end') nl = sometok('nl') equality = id + skip(is_) + id >> tuple expr = equality + skip(nl) file = many(expr) + end msg = """\ spam is eggs eggs isnt spam end""" toks = [x for x in tokenize(msg) if x.type != 'space'] try: file.parse(toks) except NoParseError as e: self.assertEqual(e.msg, "got unexpected token: 2,11-2,14: id 'spam'") self.assertEqual(e.state.pos, 4) self.assertEqual(e.state.max, 7) # May raise KeyError t = toks[e.state.max] self.assertEqual(t, Token('id', 'spam')) self.assertEqual((t.start, t.end), ((2, 11), (2, 14))) else: self.fail('must raise NoParseError')
def tokenize(str): 'str -> Sequence(Token)' specs = [ ('comment', (r'//.*',)), ('newline', (r'[\r\n]+',)), ('space', (r'[ \t\r\n]+',)), ('number', (r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)',)), ('name', (r'([A-Za-z\200-\377_0-9]|!|\$|%|&|\*|\+|-|/|\?|=|\<|\>)+',)), ('kw_bind', (r'#bind',)), ('kw_halt', (r'#halt',)), ('op_lambda', (r'\\',)), ('op_map', (r'::',)), ('form_open', (r'\(',)), ('form_close', (r'\)',)) ] useless = ['comment', 'space', 'newline'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(s): tokenizer = make_tokenizer([ ('comment', (r'#.*\n', )), ('newline', (r'[\r\n]+', )), ('space', (r'[ \t\v]+', )), ('operator', (r'\->|not|and|or|is|as[\?!]|===|>>=?|<<=?|[&\^\|\+\-\*\/%~><!=]=?|' r'[\.:,\?@{}\[\]\(\)]', )), ('name', (r'[^\W\d][\w]*', )), ('number', (r'[-+]?(0|([1-9][0-9]*))(\.[0-9]+)?([Ee][+-]?[0-9]+)?', )), ('string', (r'\'[^\']*\'', )), ]) # Ignore whitespaces and comments. return [ token for token in tokenizer(s) if token.type not in ['space', 'newline', 'comment'] ]
def tokenize(str): 'str -> Sequence(Token)' specs = [ Spec('space', r'[ \t\r\n]+'), Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE), Spec('number', r''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp ''', VERBOSE), Spec('op', r'[()\-\*\+/,]'), Spec('func', r'A0001|A0002|DECUMULATE|FUTURES|JOIN|SAME_TSRANGE_OF|SUM|TEST|TSERIES|WEIGHTED|FLOAT'), Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'), ] useless = ['space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize( string ): """str -> Sequence(Token)""" # Basic Tokens Spec specs = [ ( 'Comment', ( r' *#.*', ) ), ( 'Space', ( r'[ \t\r\n]+', ) ), ( 'USBCode', ( r'U(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'USBCodeStart', ( r'U\[', ) ), ( 'ConsCode', ( r'CONS(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'ConsCodeStart', ( r'CONS\[', ) ), ( 'SysCode', ( r'SYS(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'SysCodeStart', ( r'SYS\[', ) ), ( 'LedCode', ( r'LED(("[^"]+")|(0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'LedCodeStart', ( r'LED\[', ) ), ( 'ScanCode', ( r'S((0x[0-9a-fA-F]+)|([0-9]+))', ) ), ( 'ScanCodeStart', ( r'S\[', ) ), ( 'PixelCodeStart', ( r'P\[.*', ) ), # Discarded, needs KLL 0.5 ( 'AnimationStart', ( r'A\[.*', ) ), # Discarded, needs KLL 0.5 ( 'CodeStart', ( r'\[', ) ), ( 'CodeEnd', ( r'\]', ) ), ( 'String', ( r'"[^"]*"', ) ), ( 'SequenceString', ( r"'[^']*'", ) ), ( 'Position', ( r'r?[xyz]:-?[0-9]+(.[0-9]+)?', ) ), ( 'Operator', ( r'<=|=>|:\+|:-|::|:|=', ) ), ( 'Number', ( r'(-[ \t]*)?((0x[0-9a-fA-F]+)|(0|([1-9][0-9]*)))', VERBOSE ) ), ( 'Comma', ( r',', ) ), ( 'Dash', ( r'-', ) ), ( 'Plus', ( r'\+', ) ), ( 'Parenthesis', ( r'\(|\)', ) ), ( 'None', ( r'None', ) ), ( 'Name', ( r'[A-Za-z_][A-Za-z_0-9]*', ) ), ( 'VariableContents', ( r'''[^"' ;:=>()]+''', ) ), ( 'EndOfLine', ( r';', ) ), ] # Tokens to filter out of the token stream useless = ['Space', 'Comment'] # Discarded expresssions (KLL 0.4+) useless.extend( ['PixelCodeStart', 'AnimationStart'] ) tokens = make_tokenizer( specs ) return [x for x in tokens( string ) if x.type not in useless]
def tokenize(string): 'unicode -> Sequence(Token)' specs = [ ('Comment', (r'#.*', )), ('NL', (r'[\r\n]+', )), ('Space', (r'[ ]+', )), ('JunkSpace', (r'[\t]+', )), ('Op', (r'[\[\]|:{}]', )), #('Regex', (r'<(\w|[-={}\[\]|().,^$+*?:\\])*>', re.UNICODE)), ('Regex', (r'<re>.*?</re>', re.UNICODE)), ('QuotedName', (r'"[^"\n]+"', re.UNICODE)), ('Name', (r'[^:<>\[\]{}| \n]+', re.UNICODE)), #('Name', (r"(\w[\u0300\u0301\u0302]?)+([-./](\w[\u0300\u0301\u0302]?)+)*['\u2019]?",re.UNICODE)), #('Name', (r'(\w[\u0300\u0301]?([-./](\w[\u0300\u0301]?)+)*|[-0-9][-0-9]*)',re.UNICODE)) ] useless = ['Comment', 'NL', 'JunkSpace'] tok = make_tokenizer(specs) #print("DEBUG TOKENIZER: ", [x for x in tok(string)]) return [x for x in tok(string) if x.type not in useless]
def tokenize(str): 'str -> Sequence(Token)' specs = [ Spec('space', r'[ \t\r\n]+'), Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE), Spec( 'number', r''' -? # Minus (0|([1-9][0-9]*)) # Int (\.[0-9]+)? # Frac ([Ee][+-][0-9]+)? # Exp ''', VERBOSE), Spec('op', r'[{}\[\]\-,:]'), Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'), ] useless = ['space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def tokenize(str): 'str -> Sequence(Token)' specs = [ Spec('space', r'[ \t\r\n]+'), Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE), # NOTE: sometimes number gets into names place thus we shouldn't use them # TODO: consider removing or updating it # Spec('number', r''' # -? # Minus # (0|([1-9][0-9]*)) # Int # (\.[0-9]+)? # Frac # ([Ee][+-][0-9]+)? # Exp # \b''', VERBOSE), Spec('op', r'[{}\(\),;=]'), Spec('comment', r'/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/'), Spec('name', r'[/.A-Za-z_0-9]+'), ] useless = ['space'] t = make_tokenizer(specs) return [x for x in t(str) if x.type not in useless]
def test_error_info(): tokenize = make_tokenizer([ Spec('keyword', r'(is|end)'), Spec('id', r'[a-z]+'), Spec('space', r'[ \t]+'), Spec('nl', r'[\n\r]+'), ]) try: list(tokenize('f is ф')) except LexerError as e: pass else: ok_(False, 'must raise LexerError') keyword = lambda s: tok('keyword', s) id = tok('id') is_ = keyword('is') end = keyword('end') nl = tok('nl') equality = id + skip(is_) + id >> tuple expr = equality + skip(nl) file = many(expr) + end msg = """\ rake is eggs eggs isnt spam end""" toks = [x for x in tokenize(msg) if x.type != 'space'] try: file.parse(toks) except ParserError as e: msg, pos, i = e.args eq_(msg, "got unexpected token: id 'spam'") eq_(pos, ((2, 11), (2, 14))) # May raise KeyError t = toks[i] eq_(t, Token('id', 'spam')) else: ok_(False, 'must raise ParserError')
def tokenize(str): 'str -> Sequence(Token)' specs = [ (u'level', (r'^Level \d+.*$', MULTILINE)), (u'snes_monitor', (r'^\s+\d+ SNES Function norm.*$', MULTILINE)), (u'snes_converged', (r'^\s*Nonlinear solve converged due to \w+$', MULTILINE)), (u'snes_diverged', (r'^\s*Nonlinear solve did not converge due to \w+$', MULTILINE)), (u'ksp_monitor', (r'^\s+\d+ KSP Residual norm.*$', MULTILINE)), (u'ksp_converged', (r'^\s*Linear solve converged due to \w+$', MULTILINE)), (u'ksp_diverged', (r'^\s*Linear solve did not converge due to \w+$', MULTILINE)), (u'max_wall_time', (r'^Time \(sec\):\s+\d\.\d{3}e[-+]\d\d\s+\d\.\d{5}\s+\d\.\d{3}e[-+]\d\d$', MULTILINE)), (u'event', (r'^\S{1,16}\s+\d+ \d\.\d \d\.\d{4}e[-+]\d\d \d\.\d \d\.\d\de[-+]\d\d \d\.\d (\d\.\de[-+]\d\d ){2}.*$', MULTILINE)), (u'stage', (r'^--- Event Stage \d+: .*$', MULTILINE)), (u'memory_usage', (r'^Memory usage is given in bytes:', MULTILINE)), (u'summary_begin', (r'^---------------------------------------------- PETSc Performance Summary: ----------------------------------------------$', MULTILINE)), (u'hostline', (r'^\S+ on a \S+ named \S+ with \d+ processors?, by .*$', MULTILINE)), (u'option_table_begin', (r'^#PETSc Option Table entries:$', MULTILINE)), (u'option_table_entry', (r'^-\w+(\s+\w+)?$', MULTILINE)), (u'option_table_end', (r'^#End of? PETSc Option Table entries$', MULTILINE)), (u'nl', (r'[\r\n]+', )), (u'other', (r'^.*$', MULTILINE)), # Catches all lines that we don't understand ] ignored = 'nl other'.split() t = make_tokenizer(specs) return [x for x in t(str) if x.type not in ignored]
def lex_generic_query(query): """Lex a query string. Used by bibpy's accompanying tools. """ tokenizer = lexer.make_tokenizer([ ('not', [r'\^']), ('equals', [r'=']), ('approx', [r'~']), ('le', [r'<=']), ('lt', [r'<']), ('ge', [r'>=']), ('gt', [r'>']), ('comma', [r',']), ('dash', [r'-']), ('number', [r'-?(0|([1-9][0-9]*))']), ('name', [r'\w+']), ('space', [r'[ \t\r\n]+']), ('any', [r'[^<><=>=\s=\^~]+']) ]) return remove_whitespace_tokens(tokenizer(query))