def test_empty_list_on_empty_data(self):
  """Test tokenizer returns an empty list for empty input."""
  tokenizer = AntlrTokenizer(JavaScriptLexer)
  data = ""

  tokens = tokenizer.tokenize(data)
  self.assertEqual(tokens, [])
def test_tokenizes_malformed_without_error(self):
  """Test that the tokenizer doesn't error on garbage input."""
  tokenizer = AntlrTokenizer(JavaScriptLexer)
  txt = "aasdfj1 1jhsdf9 1 3@ 1 + => adj 193"

  tokens = tokenizer.tokenize(txt)
  self.assertEqual(tokens, [
      'aasdfj1', ' ', '1', 'jhsdf9', ' ', '1', ' ', '3', '@', ' ', '1', ' ',
      '+', ' ', '=>', ' ', 'adj', ' ', '193'
  ])
def test_combine_same_as_orig(self):
  """Test that combining tokens reproduces the original input."""
  tokenizer = AntlrTokenizer(JavaScriptLexer)
  txt = """async function process(array) {
 for await (let i of array) {
 doSomething(i);
 }
 }"""

  tokens = tokenizer.tokenize(txt)
  self.assertEqual(tokenizer.combine(tokens), txt)
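# Illustrative sketch, not part of the original suite: the round-trip property
# checked above is the invariant that token-based minimizers depend on, since
# they drop slices of the token list and recombine the rest. A hypothetical
# property-style helper would look like:
#
#   def assert_round_trips(tokenizer, source):
#     assert tokenizer.combine(tokenizer.tokenize(source)) == source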
def run(data,
        thread_count=minimizer.DEFAULT_THREAD_COUNT,
        file_extension='.js'):
  """Attempt to minimize a JavaScript test case."""
  line_minimizer = delta_minimizer.DeltaMinimizer(
      utils.test, max_threads=thread_count, file_extension=file_extension)
  js_tokenizer = AntlrTokenizer(JavaScriptLexer)

  js_minimizer = JSMinimizer(
      utils.test,
      max_threads=thread_count,
      tokenizer=js_tokenizer.tokenize,
      token_combiner=js_tokenizer.combine,
      file_extension=file_extension)

  # Alternate line- and token-level passes; the second JS pass can remove
  # tokens exposed by the first.
  result = line_minimizer.minimize(data)
  result = js_minimizer.minimize(result)
  result = js_minimizer.minimize(result)
  result = line_minimizer.minimize(result)
  return result
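# Illustrative usage sketch, not part of the original module; the file names
# are hypothetical, and `utils.test` is assumed to already be configured to
# reproduce the crash being minimized:
#
#   with open('crash.js') as handle:
#     data = handle.read()
#   minimized = run(data, thread_count=2, file_extension='.js')
#   with open('crash-min.js', 'w') as handle:
#     handle.write(minimized)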
def test_tokenize_simple_js_file(self):
  """Test tokenizer works with sample JS."""
  tokenizer = AntlrTokenizer(JavaScriptLexer)
  txt = """async function process(array) {
 for await (let i of array) {
 doSomething(i);
 }
 }"""

  tokens = tokenizer.tokenize(txt)
  self.assertEqual(tokens, [
      'async', ' ', 'function', ' ', 'process', '(', 'array', ')', ' ', '{',
      '\n', ' ', 'for', ' ', 'await', ' ', '(', 'let', ' ', 'i', ' ', 'of',
      ' ', 'array', ')', ' ', '{', '\n', ' ', 'doSomething', '(', 'i', ')',
      ';', '\n', ' ', '}', '\n', ' ', '}'
  ])
class HTMLMinimizer(minimizer.Minimizer):  # pylint:disable=abstract-method
  """Specialized HTML minimizer.

  Note that this will not work properly with normal tokenizers. It simply acts
  as a wrapper around other minimizers and passes pieces of the HTML file to
  those."""

  class Token(object):
    """Helper class to represent a single token."""
    TYPE_HTML = 0
    TYPE_SCRIPT = 1

    def __init__(self, data, token_type):
      self.data = data
      self.token_type = token_type

    def __str__(self):
      return self.data

  class TokenizerState(object):
    """Enum for tokenizer states."""
    SEARCHING_FOR_SCRIPT = 0
    SEARCHING_FOR_TAG_END = 1
    SEARCHING_FOR_CLOSE_SCRIPT = 2

  HTMLTOKENIZER = AntlrTokenizer(HTMLLexer).tokenize
  JSTOKENIZER = AntlrTokenizer(JavaScriptLexer).tokenize

  TOKENIZER_MAP = {
      Token.TYPE_HTML: [HTMLTOKENIZER, HTMLTOKENIZER, HTMLTOKENIZER],
      Token.TYPE_SCRIPT: [JSTOKENIZER, JSTOKENIZER],
  }

  CHUNK_SIZES = [
      [400, 100, 20, 5],
      [400, 100, 20, 5, 2],
      [400, 100, 20, 5, 1],
  ]

  def __init__(self, test_function, *args, **kwargs):
    # The HTML minimizer will not be used directly. Instead, preserve its
    # arguments and pass them along when creating subminimizers.
    super(HTMLMinimizer, self).__init__(lambda: False)
    assert not args, 'Positional arguments not supported.'
    assert 'tokenizer' not in kwargs, 'Custom tokenizers not supported.'
    assert 'token_combiner' not in kwargs, 'Custom tokenizers not supported.'
    self.test_function = test_function
    self.kwargs = kwargs

  def minimize(self, data):
    """Wrapper to perform common tasks and call |_execute|."""
    # Do an initial line-by-line minimization to filter out noise.
    line_minimizer = delta_minimizer.DeltaMinimizer(self.test_function,
                                                    **self.kwargs)

    # Do two line minimizations to make up for the fact that minimizations on
    # bots don't always minimize as much as they can.
    for _ in range(2):
      data = line_minimizer.minimize(data)

    tokens = self.get_tokens_and_metadata(data)
    for index, token in enumerate(tokens):
      current_tokenizers = self.TOKENIZER_MAP[token.token_type]

      # We need to preserve the parts of the test case that are not currently
      # being minimized. Create a special token combiner that adds these
      # portions of the test to the combined tokens.
      prefix = self.combine_tokens(tokens[:index])
      suffix = self.combine_tokens(tokens[index + 1:])
      token_combiner = functools.partial(
          self.combine_worker_tokens, prefix=prefix, suffix=suffix)

      for level, current_tokenizer in enumerate(current_tokenizers):
        if token.token_type == HTMLMinimizer.Token.TYPE_HTML:
          current_minimizer = chunk_minimizer.ChunkMinimizer(
              self.test_function,
              chunk_sizes=HTMLMinimizer.CHUNK_SIZES[level],
              token_combiner=token_combiner,
              tokenizer=current_tokenizer,
              **self.kwargs)
        else:
          current_minimizer = js_minimizer.JSMinimizer(
              self.test_function,
              token_combiner=token_combiner,
              tokenizer=current_tokenizer,
              **self.kwargs)

        # Strip the preserved prefix and suffix back off the minimized result
        # so that only this token's data is updated.
        result_data = current_minimizer.minimize(token.data)
        start = len(prefix)
        end = len(result_data) - len(suffix)
        token.data = result_data[start:end]

    # TODO(mbarbella): Remove this once other minimizers are improved.
    # Do a final line-by-line minimization pass.
    data = self.combine_tokens(tokens)
    return line_minimizer.minimize(data)

  @staticmethod
  def get_tokens_and_metadata(data):
    """Get the token list with associated metadata."""
    tokens = []
    state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_SCRIPT
    current_token_start = 0
    current_token_type = HTMLMinimizer.Token.TYPE_HTML

    index = 0
    while 0 <= index < len(data):
      if state == HTMLMinimizer.TokenizerState.SEARCHING_FOR_SCRIPT:
        # In this case, we are simply searching for the next script tag.
        index = data.find(SCRIPT_START_STRING, index)
        state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_TAG_END

      elif state == HTMLMinimizer.TokenizerState.SEARCHING_FOR_TAG_END:
        # Make sure that this really looks like a script tag.
        next_newline = data.find('\n', index)
        tag_end = data.find('>', index)
        if 0 <= tag_end < next_newline or next_newline < 0 <= tag_end:
          # The end of the script tag is before the next newline, so it should
          # be safe to attempt to split this.
          index = tag_end + 1
          token = HTMLMinimizer.Token(data[current_token_start:index],
                                      current_token_type)
          tokens.append(token)

          # Update state.
          current_token_type = HTMLMinimizer.Token.TYPE_SCRIPT
          current_token_start = index
          state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_CLOSE_SCRIPT
        else:
          # We found a newline before the end of tag or did not find the end
          # of the tag, so something seems wrong. Skip this one.
          index += len(SCRIPT_START_STRING)

      elif state == HTMLMinimizer.TokenizerState.SEARCHING_FOR_CLOSE_SCRIPT:
        # Simply look for the end of this script.
        index = data.find(SCRIPT_END_STRING, index)
        if index < 0:
          break

        # TODO(mbarbella): Optimize for empty script case (e.g. for "src=").
        token = HTMLMinimizer.Token(data[current_token_start:index],
                                    current_token_type)
        tokens.append(token)
        current_token_start = index
        current_token_type = HTMLMinimizer.Token.TYPE_HTML
        state = HTMLMinimizer.TokenizerState.SEARCHING_FOR_SCRIPT

    token = HTMLMinimizer.Token(data[current_token_start:], current_token_type)
    tokens.append(token)
    return tokens

  @staticmethod
  def combine_worker_tokens(tokens, prefix='', suffix=''):
    """Combine tokens for a worker minimizer."""
    return '%s%s%s' % (prefix, ''.join(tokens), suffix)

  @staticmethod
  def combine_tokens(tokens):
    """Combine tokens into a usable format, stripping metadata."""
    return ''.join([str(t) for t in tokens])

  @staticmethod
  def run(data,
          thread_count=minimizer.DEFAULT_THREAD_COUNT,
          file_extension='.html'):
    """Attempt to minimize an HTML test case."""
    html_minimizer = HTMLMinimizer(
        utils.test, max_threads=thread_count, file_extension=file_extension)
    return html_minimizer.minimize(data)
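# Illustrative sketch of how the state machine in get_tokens_and_metadata
# splits a document (not part of the original module; assumes
# SCRIPT_START_STRING and SCRIPT_END_STRING are the usual '<script' and
# '</script' markers):
#
#   page = '<body>\n<script>\nfoo();\n</script>\n</body>\n'
#   parts = HTMLMinimizer.get_tokens_and_metadata(page)
#   # Three tokens, alternating types, whose data concatenates back to page:
#   #   TYPE_HTML:   '<body>\n<script>'
#   #   TYPE_SCRIPT: '\nfoo();\n'
#   #   TYPE_HTML:   '</script>\n</body>\n'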
def tokenize(data):
  """HTML tokenizer."""
  return AntlrTokenizer(HTMLLexer).tokenize(data)
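# Illustrative usage sketch (not from the original source). Exact token
# boundaries depend on the ANTLR HTMLLexer grammar, but joining the tokens
# reproduces the input, mirroring the combine() round-trip tested above:
#
#   tokens = tokenize('<p>hi</p>')
#   assert ''.join(tokens) == '<p>hi</p>'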