def tokenize(options):
    """Tokenize the file named by ``options.filename`` with a LycParser.

    If ``options`` carries an ``output`` attribute, each token (and each
    whitespace gap between tokens) is encoded with ``options.encoder`` as a
    ``(token_value, first_index, index_after)`` triple and written to
    ``options.output``; token values are numeric codes when
    ``options.binary`` is truthy, token-type names otherwise.  Without
    ``output``, tokens are simply printed one per line.

    Any CompilerError raised during tokenization is caught and its trace
    printed; nothing is returned.
    """
    try:
        filename = options.filename
        # FIX: the original used open(...).read() without closing the file,
        # leaking the file handle; a with-block guarantees closure.
        with open(filename, encoding='utf-8') as source_file:
            code = source_file.read()
        stream = StringStream(code)
        parser = LycParser()
        # NOTE(review): `'output' in options` assumes options supports
        # membership tests (argparse.Namespace does) — confirm options' type.
        if 'output' in options:
            output = options.output
            encoder = options.encoder
            # Filler value used for the gaps between tokens (whitespace runs).
            filler_token_value = Tokens.WHITESPACE.value if options.binary else Tokens.WHITESPACE.name
            for token, first_index, index_after in parser.tokenize_with_intervals(stream):
                if token is None:
                    # A gap with no token: emit a whitespace filler interval.
                    bytes_ = encoder((filler_token_value, first_index, index_after))
                else:
                    token_value = token.type.value if options.binary else token.type.name
                    bytes_ = encoder((token_value, first_index, index_after))
                output.write(bytes_)
        else:
            for token in parser.tokenize(stream):
                print(str(token))
    except CompilerError as e:
        print(e.trace)
def tokenize(message: list) -> list:
    """Handle a 'tokenize' request message.

    Expected request: ``['tokenize', file_name:str, file_contents:str,
    binary=False]``.  Returns a packed reply ``['tokenize', token_ranges]``
    where each range is ``[token_code, first_index, index_after]``, or a
    packed error reply when validation or tokenization fails.
    """
    time_ = time.time()
    if not 3 <= len(message) <= 4:
        return error(
            "Tokenization request format is:\n input: ['tokenize', file_name:str, file_contents:str, binary=False]\n output: ['tokenize', token_ranges:list(list(token_code, first_index, index_after))]"
        )
    file_name = message[1]
    file_contents = message[2]
    if not isinstance(file_name, str):
        return error(
            'Tokenization request: "file_name" arg must be a string.')
    if not isinstance(file_contents, str):
        return error(
            'Tokenization request: "file_contents" arg must be a string.')
    if VERBOSE:
        print("\tfile-name: " + file_name)
        print("\tfile-contents: " +
              (repr(file_contents) if len(file_contents) < 80 else
               repr(file_contents[0:80]) + " ..."))
    if len(message) == 4:
        binary = message[3]
        # FIX: the original tested isinstance(file_contents, bool); since
        # file_contents is already known to be a str, that check failed for
        # EVERY 4-element request.  The "binary" argument is what must be
        # validated, and the message must say "boolean", not "string".
        if not isinstance(binary, bool):
            return error(
                'Tokenization request: "binary" arg must be a boolean.')
    else:
        # NOTE(review): the request doc above says binary defaults to False,
        # but the code has always defaulted to True — kept as-is to avoid
        # changing the wire behavior; confirm which is intended.
        binary = True
    stream = StringStream(file_contents, name=file_name)
    parser = AnokyParser()
    token_ranges = []
    current_index = 0
    try:
        for token in parser.tokenize(stream, emmit_restart_tokens=True):
            token_first = token.range.first_position.index
            token_after = token.range.position_after.index
            # if token_first > current_index:
            #     token_type = Tokens._TokenTypes.WHITESPACE.value if binary else Tokens._TokenTypes.WHITESPACE.name
            #     token_ranges.append([token_type, current_index, token_first])
            #     current_index = token_first
            # el
            if token_first < current_index:
                raise Exception(
                    token_first,
                    "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                    % (current_index, token_first))
            token_type = token.type.value if binary else token.type.name
            # Ranges start at current_index (not token_first), so any
            # whitespace gap is folded into the following token's range.
            token_ranges.append([token_type, current_index, token_after])
            current_index = token_after
    except TokenizingError as e:
        return error(e)
    if len(token_ranges) > 0:
        # Extend the last range by one character when it stops short of EOF.
        last_token_range = token_ranges[-1]
        if last_token_range[2] < len(file_contents):
            last_token_range[2] += 1
    # FIX: the original printed time_ - time.time(), a negative duration.
    print("Tokenization took %s seconds" % (time.time() - time_))
    return pack(['tokenize', token_ranges])
def _get_stream(self, code_or_stream: Union[str, CharacterStream]):
    """Coerce *code_or_stream* into a character stream.

    A plain string is wrapped in a StringStream; anything else must already
    be a CharacterStream and is returned unchanged.
    """
    if isinstance(code_or_stream, str):
        return StringStream(code_or_stream)
    assert isinstance(code_or_stream, CharacterStream)
    return code_or_stream
def interactive_anoky(options):
    """Run an interactive Anoky read-compile-execute loop (REPL).

    Repeatedly prompts for multi-line input, pushes it through the
    tokenize -> transduce -> expand -> generate pipeline (each stage gated
    by the corresponding flag on *options*), compiles the resulting Python
    AST, and optionally executes it.  Exits cleanly on EOF (Ctrl-D) or
    KeyboardInterrupt (Ctrl-C).
    """
    options.filename = '<interactive>'
    # Make the current directory importable, mirroring the normal Python REPL.
    sys.path = [''] + sys.path
    (CG, init_code) = code_generator.begin(interactive=True,
                                           special_forms=__special_forms__,
                                           macros=__macros__,
                                           id_macros=__id_macros__)
    interactive_history = InMemoryHistory()
    try:
        while True:
            written_code = prompt('>>> ',
                                  history=interactive_history,
                                  multiline=True)
            stream = StringStream(written_code, '<interactive>')
            try:
                node = anoky_tokenize(stream, options)
                # Each pipeline stage runs only if its options flag is set;
                # `continue` here also skips the try's `else:` clause below,
                # so partial pipelines never reach AST compilation.
                if not options.arrange_tokens: continue
                anoky_transduce(node, options)
                if not options.expand_macros: continue
                anoky_expand(node, options)
                if not options.generate_code: continue
                py_ast = anoky_generate(node, options, CG)
                py_ast = code_generator.end(py_ast, CG)
                if options.print_python_ast: print_ast(py_ast)
                if options.print_python_code: print_python_code(py_ast)
            except CompilerError as e:
                # Expected compiler diagnostics: show the trace and re-prompt.
                print(e.trace)
            except Exception:
                # Anything else is a compiler bug; report loudly but keep
                # the REPL alive.
                print(
                    '\n!—›– Compiler raised unhandled exception (this is not supposed to happen)!!! –‹—!'
                )
                traceback.print_exc()
            else:
                # Full pipeline completed: compile and (optionally) run.
                ast.fix_missing_locations(py_ast)
                try:
                    compiled_ast = compile(py_ast,
                                           filename='<interactive>',
                                           mode='single')
                except Exception:
                    print('\n——›– AST compilation failed !!! –‹——')
                    traceback.print_exc()
                    # Dump the offending AST/code to aid debugging.
                    print_ast(py_ast)
                    print_python_code(py_ast)
                else:
                    if options.execute:
                        try:
                            exec(compiled_ast)
                        except Exception as e:
                            # User-code errors: print and keep the REPL alive.
                            traceback.print_exc()
    except EOFError:
        return
    except KeyboardInterrupt:
        return
async def async_tokenize(id, incomming, outgoing):
    """Serve one asynchronous, request-driven tokenization session.

    Protocol (see the error string below for the full message syntax):
    the first message supplies the file name/contents plus an optional
    global character offset and indentation level; the handler replies
    with its *id*, then emits exactly one token range per
    'async_tokenize_next' request, and ``[-1, -1, -1]`` once the stream
    is exhausted, until a 'close' message arrives.

    NOTE(review): an identical definition of this coroutine appears later
    in this file and will shadow this one if both are in the same module.
    """

    def my_send_message(msg):
        # Pack and push a reply onto the outgoing socket (logged if VERBOSE).
        if VERBOSE: print("\treply: " + str(msg))
        return outgoing.push_message(pack(msg))

    def my_error(e):
        # Pack and push an error reply (logged if VERBOSE).
        nonlocal outgoing
        if VERBOSE: print("\terror: " + str(e))
        return outgoing.push_message(error(e))

    # first message (see below for syntax)
    # It will give us the filename name and contents of the written code,
    # and also whether we should mark the first offset as being anything other than zero,
    # and the indentation level at which the code is written
    message = await incomming()
    if not 3 <= len(message) <= 5:
        # NOTE(review): the last three fragments below concatenate without
        # a "\n" separator, so "['close']at any moment..." runs together
        # in the emitted help text — likely missing newlines.
        return outgoing.push_message(
            error(
                "Async tokenization request format is:\n"
                " first message: ['async_tokenize', file_name:str, file_contents:str, first_offset:int = 0, indentation_level:int = 0]\n"
                " first reply: ['async_tokenize', handler_id:int]\n"
                " following messages: ['async_tokenize_next', handler_id:int]\n"
                " reply: ['async_tokenize_next', token_code, first_index, index_after]\n"
                " ending_message: ['close', handler_id:int]\n"
                " reply: ['close']"
                "at any moment, reply may be:"
                " ['async_tokenize_error', message:str, first_position?:int, position_after?:int]"
            ))
    file_name = message[1]
    file_contents = message[2]
    if not isinstance(file_name, str):
        return my_error(
            'Async tokenization request: "file_name" arg must be a string.'
        )
    if not isinstance(file_contents, str):
        return my_error(
            'Async tokenization request: "file_contents" arg must be a string.'
        )
    if VERBOSE:
        print("\tfile-name: " + file_name)
        print("\tfile-contents: " +
              (repr(file_contents) if len(file_contents) < 80 else
               repr(file_contents[0:80]) + " ..."))
        if len(message) >= 4:
            print("\toffset: %s " % message[3])
        if len(message) >= 5:
            print("\tindentation: %s" % message[4])
    # Get global offset of first character, if any
    if len(message) >= 4:
        shift = message[3]
        if not isinstance(shift, int):
            return my_error(
                'Tokenization request: "first_offset" arg must be an integer.'
            )
    else:
        shift = 0
    # get indentation level of code, if any
    if len(message) >= 5:
        indentation_level = message[4]
        if not isinstance(indentation_level, int):
            return my_error(
                'Tokenization request: "indentation_level" arg must be an integer.'
            )
    else:
        indentation_level = 0
    # reply with the id of this async tokenization handler
    my_send_message(['async_tokenize', id])
    # Now the tokenization actually begins
    # We will tokenize each token, and between tokens we wait for the request of the next token.
    # First we prepare the stream, with the right shift and indentation level
    stream = StringStream(file_contents, name=file_name)
    if indentation_level > 0:
        # Wrap the stream so the leading indentation is consumed before
        # tokenization starts.
        stream = IndentedCharacterStream(stream)
        stream.readn(indentation_level)
        stream.push()
    # Then we tokenize the given text,
    parser = AnokyParser()
    current_index = indentation_level
    try:
        for token in parser.tokenize(stream, emmit_restart_tokens=True):
            token_first = token.range.first_position.index
            token_after = token.range.position_after.index
            # if token_first > current_index:
            #     token_type = Tokens._TokenTypes.WHITESPACE.value
            #     # We wait for the next token request, and emit a whitespace filler to the outgoing socket
            #     message = await incomming()
            #     if VERBOSE: print("\tmessage: %s" % message)
            #     assert len(message) >= 2 and message[1] == id
            #     if message[0] == 'close':
            #         my_send_message(['close'])
            #         return
            #     elif message[0] == 'async_tokenize_next':
            #         my_send_message(['async_tokenize_next', token_type, current_index+shift, token_first+shift])
            #     else:
            #         return my_error("Unkown message for async_tokenize handler, '%s'." % message[0])
            #     current_index = token_first
            # el
            if token_first < current_index:
                # Tokens must never start before the end of the previous one.
                raise Exception(
                    token_first,
                    "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                    % (current_index + shift, token_first + shift))
            token_type = token.type.value
            # Now that we know the next token type, we wait for the next token request,
            # and emit it to the outgoing socket
            message = await incomming()
            if VERBOSE: print("\tmessage: " + str(message))
            assert len(message) >= 2 and message[1] == id
            if message[0] == 'close':
                my_send_message(['close'])
                return
            elif message[0] == 'async_tokenize_next':
                # Emitted range starts at current_index, folding any
                # whitespace gap into this token; indices are shifted by
                # the requested global offset.
                my_send_message([
                    'async_tokenize_next', token_type,
                    current_index + shift, token_after + shift
                ])
            else:
                # NOTE(review): "Unkown" is a typo for "Unknown" in this
                # runtime string (left untouched here).
                return my_error(
                    "Unkown message for async_tokenize handler, '%s'." %
                    message[0])
            current_index = token_after
    except TokenizingError as e:
        return my_error(e)
    # All tokens emitted: answer every further request with the end-of-stream
    # sentinel [-1, -1, -1] until the client closes the session.
    while True:
        message = await incomming()
        if VERBOSE: print("\tmessage: %s" % message)
        assert len(message) >= 2 and message[1] == id
        if message[0] == 'close':
            my_send_message(['close'])
            return
        elif message[0] == 'async_tokenize_next':
            my_send_message(['async_tokenize_next', -1, -1, -1])
        else:
            return my_error(
                "Unkown message for async_tokenize handler, '%s'." %
                message[0])
    return
def colorize(message: list) -> list:
    """Handle a 'colorize' request message.

    Expected request: ``['colorize', file_name:str, file_contents:str,
    binary=False]``.  Parses, macro-expands and code-generates the source,
    then walks the resulting tree collecting ``[color_code, first_index,
    index_after]`` ranges for every colored element with a known position.
    Returns a packed ``['colorize', colorized_tokens]`` reply, or a packed
    error reply on failure.
    """
    if not 3 <= len(message) <= 4:
        return error(
            "Colorization request format is:\n input: ['colorize', file_name:str, file_contents:str, binary=False]\n output: ['colorize', token_ranges:list(list(color_code, first_index, index_after))]"
        )
    file_name = message[1]
    file_contents = message[2]
    if not isinstance(file_name, str):
        return error(
            'Colorization request: "file_name" arg must be a string.')
    if not isinstance(file_contents, str):
        return error(
            'Colorization request: "file_contents" arg must be a string.')
    if VERBOSE:
        print("\tfile-name: " + file_name)
        print("\tfile-contents: " +
              (repr(file_contents) if len(file_contents) < 80 else
               repr(file_contents[0:80]) + " ..."))
    if len(message) == 4:
        binary = message[3]
        # FIX: the original tested isinstance(file_contents, bool), which is
        # always False for the (already validated) string contents, so every
        # 4-element request was rejected; validate `binary` and say "boolean".
        if not isinstance(binary, bool):
            return error(
                'Colorization request: "binary" arg must be a boolean.')
    else:
        binary = True
    # NOTE(review): `binary` is validated but never used below — confirm
    # whether colorization was meant to honor it.
    stream = StringStream(file_contents, name=file_name)
    parser = AnokyParser()
    code_expander = DefaultExpander()
    code_generator = DefaultGenerator()
    try:
        node = parser.parse(stream)
        code_expander.expand_unit(node)
        code_generator.generate_unit(node)
        colorized_tokens = []
        bad_color_error = None

        def extract_colorized_tokens(element):
            # Recursively collect colorized ranges from element and its code.
            nonlocal colorized_tokens, bad_color_error
            if element.color is not None and is_not_none(
                    element, ".range.first_position.index") and is_not_none(
                        element, ".range.position_after.index"):
                token_color = element.color
                token_first = element.range.first_position.index
                token_after = element.range.position_after.index
                if not isinstance(token_color, int):
                    # FIX: the original did `return error(...)` here, which
                    # only returned from this nested helper and silently
                    # discarded the error; record it for the outer function.
                    bad_color_error = error(
                        'Colorization request: color of token "%s" was not int!'
                        % element.text)
                    return
                colorized_tokens.append(
                    [token_color, token_first, token_after])
            if isinstance(element.code, Node):
                for subelement in element.code:
                    extract_colorized_tokens(subelement)

        for element in node:
            extract_colorized_tokens(element)
        if bad_color_error is not None:
            return bad_color_error
    except CompilerError as e:
        return error(e)
    return pack(['colorize', colorized_tokens])
async def async_tokenize(id, incomming, outgoing):
    """Serve one asynchronous, request-driven tokenization session.

    NOTE(review): this is a token-for-token duplicate of the earlier
    ``async_tokenize`` definition in this file; if both live in one module
    this later definition shadows the earlier one.  Protocol: the first
    message supplies file name/contents plus optional global offset and
    indentation level; the handler replies with its *id*, emits one token
    range per 'async_tokenize_next' request, then ``[-1, -1, -1]`` after
    the stream is exhausted, until a 'close' message arrives.
    """

    def my_send_message(msg):
        # Pack and push a reply onto the outgoing socket (logged if VERBOSE).
        if VERBOSE: print("\treply: " + str(msg))
        return outgoing.push_message(pack(msg))

    def my_error(e):
        # Pack and push an error reply (logged if VERBOSE).
        nonlocal outgoing
        if VERBOSE: print("\terror: " + str(e))
        return outgoing.push_message(error(e))

    # first message (see below for syntax)
    # It will give us the filename name and contents of the written code,
    # and also whether we should mark the first offset as being anything other than zero,
    # and the indentation level at which the code is written
    message = await incomming()
    if not 3 <= len(message) <= 5:
        # NOTE(review): the last three string fragments concatenate without
        # "\n" separators, so the emitted help text runs together.
        return outgoing.push_message(error(
            "Async tokenization request format is:\n"
            " first message: ['async_tokenize', file_name:str, file_contents:str, first_offset:int = 0, indentation_level:int = 0]\n"
            " first reply: ['async_tokenize', handler_id:int]\n"
            " following messages: ['async_tokenize_next', handler_id:int]\n"
            " reply: ['async_tokenize_next', token_code, first_index, index_after]\n"
            " ending_message: ['close', handler_id:int]\n"
            " reply: ['close']"
            "at any moment, reply may be:"
            " ['async_tokenize_error', message:str, first_position?:int, position_after?:int]"))
    file_name = message[1]
    file_contents = message[2]
    if not isinstance(file_name, str):
        return my_error('Async tokenization request: "file_name" arg must be a string.')
    if not isinstance(file_contents, str):
        return my_error('Async tokenization request: "file_contents" arg must be a string.')
    if VERBOSE:
        print("\tfile-name: " + file_name)
        print("\tfile-contents: " + (
            repr(file_contents) if len(file_contents) < 80 else
            repr(file_contents[0:80]) + " ..."))
        if len(message) >= 4:
            print("\toffset: %s " % message[3])
        if len(message) >= 5:
            print("\tindentation: %s" % message[4])
    # Get global offset of first character, if any
    if len(message) >= 4:
        shift = message[3]
        if not isinstance(shift, int):
            return my_error('Tokenization request: "first_offset" arg must be an integer.')
    else:
        shift = 0
    # get indentation level of code, if any
    if len(message) >= 5:
        indentation_level = message[4]
        if not isinstance(indentation_level, int):
            return my_error('Tokenization request: "indentation_level" arg must be an integer.')
    else:
        indentation_level = 0
    # reply with the id of this async tokenization handler
    my_send_message(['async_tokenize', id])
    # Now the tokenization actually begins
    # We will tokenize each token, and between tokens we wait for the request of the next token.
    # First we prepare the stream, with the right shift and indentation level
    stream = StringStream(file_contents, name=file_name)
    if indentation_level > 0:
        # Wrap the stream so leading indentation is consumed up front.
        stream = IndentedCharacterStream(stream)
        stream.readn(indentation_level)
        stream.push()
    # Then we tokenize the given text,
    parser = AnokyParser()
    current_index = indentation_level
    try:
        for token in parser.tokenize(stream, emmit_restart_tokens=True):
            token_first = token.range.first_position.index
            token_after = token.range.position_after.index
            # if token_first > current_index:
            #     token_type = Tokens._TokenTypes.WHITESPACE.value
            #     # We wait for the next token request, and emit a whitespace filler to the outgoing socket
            #     message = await incomming()
            #     if VERBOSE: print("\tmessage: %s" % message)
            #     assert len(message) >= 2 and message[1] == id
            #     if message[0] == 'close':
            #         my_send_message(['close'])
            #         return
            #     elif message[0] == 'async_tokenize_next':
            #         my_send_message(['async_tokenize_next', token_type, current_index+shift, token_first+shift])
            #     else:
            #         return my_error("Unkown message for async_tokenize handler, '%s'." % message[0])
            #     current_index = token_first
            # el
            if token_first < current_index:
                # Tokens must never start before the end of the previous one.
                raise Exception(token_first,
                                "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                                % (current_index+shift, token_first+shift))
            token_type = token.type.value
            # Now that we know the next token type, we wait for the next token request,
            # and emit it to the outgoing socket
            message = await incomming()
            if VERBOSE: print("\tmessage: " + str(message))
            assert len(message) >= 2 and message[1] == id
            if message[0] == 'close':
                my_send_message(['close'])
                return
            elif message[0] == 'async_tokenize_next':
                # Range starts at current_index (whitespace folded into the
                # following token); indices shifted by the global offset.
                my_send_message(['async_tokenize_next', token_type, current_index+shift, token_after+shift])
            else:
                # NOTE(review): "Unkown" is a typo in this runtime string
                # (left untouched here).
                return my_error("Unkown message for async_tokenize handler, '%s'." % message[0])
            current_index = token_after
    except TokenizingError as e:
        return my_error(e)
    # All tokens emitted: answer further requests with the [-1, -1, -1]
    # end-of-stream sentinel until the client closes the session.
    while True:
        message = await incomming()
        if VERBOSE: print("\tmessage: %s" % message)
        assert len(message) >= 2 and message[1] == id
        if message[0] == 'close':
            my_send_message(['close'])
            return
        elif message[0] == 'async_tokenize_next':
            my_send_message(['async_tokenize_next', -1, -1, -1])
        else:
            return my_error("Unkown message for async_tokenize handler, '%s'." % message[0])
    return
def __init__(self, filepath, encoding='utf-8'):
    """Build the stream from the full text of *filepath*.

    The file is read eagerly (and closed) with the given *encoding*; the
    path itself is passed along as the stream's name.
    """
    with open(filepath, 'r', encoding=encoding) as source:
        text = source.read()
    # Explicit base-class call kept as in the original (not super()),
    # preserving the exact initialization path.
    StringStream.__init__(self, text, filepath)