def extract_shebang(cls, data): """ Internal: Extract normalized shebang command token. Examples extract_shebang("#!/usr/bin/ruby") # => "ruby" extract_shebang("#!/usr/bin/env node") # => "node" Returns String token or nil it couldn't be parsed. """ s = StringScanner(data) path = s.scan(r'^#!\s*\S+') if path: script = path.split('/')[-1] if script == 'env': s.scan(r'\s+') script = s.scan(r'\S+') if script: script = re.compile(r'[^\d]+').match(script).group(0) return script return
def extract_sgml_tokens(self, data): """ Internal: Extract tokens from inside SGML tag. data - SGML tag String. Examples extract_sgml_tokens("<a href='' class=foo>") # => ["<a>", "href="] Returns Array of token Strings. """ s = StringScanner(data) tokens = [] while not s.is_eos: # Emit start token token = s.scan(r'<\/?[^\s>]+') if token: tokens.append(token + '>') continue # Emit attributes with trailing = token = s.scan(r'\w+=') if token: tokens.append(token) # Then skip over attribute value if s.scan('"'): s.skip_until(r'[^\\]"') continue if s.scan("'"): s.skip_until(r"[^\\]'") continue s.skip_until(r'\w+') continue # Emit lone attributes token = s.scan(r'\w+') if token: tokens.append(token) # Stop at the end of the tag if s.scan('>'): s.terminate continue s.getch return tokens
def extract_tokens(self, data): """ Internal: Extract generic tokens from data. data - String to scan. Examples extract_tokens("printf('Hello')") # => ['printf', '(', ')'] Returns Array of token Strings. """ s = StringScanner(data) tokens = [] while not s.is_eos: if s.pos >= BYTE_LIMIT: break token = s.scan(r'^#!.+') if token: name = self.extract_shebang(token) if name: tokens.append('SHEBANG#!%s' % name) continue # Single line comment if s.is_beginning_of_line and s.scan(START_SINGLE_LINE_COMMENT): s.skip_until(r'\n|\Z') continue # Multiline comments token = s.scan(START_MULTI_LINE_COMMENT) if token: close_token = dict(MULTI_LINE_COMMENTS).get(token) s.skip_until(re.compile(re.escape(close_token))) continue # Skip single or double quoted strings if s.scan(r'"'): if s.peek(1) == '"': s.getch else: s.skip_until(r'[^\\]"') if s.scan(r"'"): if s.peek(1) == "'": s.getch else: s.skip_until(r"[^\\]'") # Skip number literals if s.scan(r'(0x)?\d(\d|\.)*'): continue # SGML style brackets token = s.scan(r'<[^\s<>][^<>]*>') if token: for t in self.extract_sgml_tokens(token): tokens.append(t) continue # Common programming punctuation token = s.scan(r';|\{|\}|\(|\)|\[|\]') if token: tokens.append(token) continue # Regular token token = s.scan(r'[\w\.@#\/\*]+') if token: tokens.append(token) continue # Common operators token = s.scan(r'<<?|\+|\-|\*|\/|%|&&?|\|\|?') if token: tokens.append(token) continue s.getch return tokens
def __init__(self, lexicon, str_): self.lexicon = lexicon self.scanner = StringScanner(str_)
def scanner(): return StringScanner("bar foobar")