def tokenize(message: list) -> list:
    time_ = time.time()
    if not 3 <= len(message) <= 4:
        return error(
            "Tokenization request format is:\n"
            " input: ['tokenize', file_name:str, file_contents:str, binary=False]\n"
            " output: ['tokenize', token_ranges:list(list(token_code, first_index, index_after))]")
    file_name = message[1]
    file_contents = message[2]
    if not isinstance(file_name, str):
        return error('Tokenization request: "file_name" arg must be a string.')
    if not isinstance(file_contents, str):
        return error('Tokenization request: "file_contents" arg must be a string.')
    if VERBOSE:
        print("\tfile-name: " + file_name)
        print("\tfile-contents: " + (repr(file_contents) if len(file_contents) < 80
                                     else repr(file_contents[0:80]) + " ..."))
    if len(message) == 4:
        binary = message[3]
        if not isinstance(binary, bool):
            return error('Tokenization request: "binary" arg must be a boolean.')
    else:
        binary = True

    stream = StringStream(file_contents, name=file_name)
    parser = AnokyParser()
    token_ranges = []
    current_index = 0
    try:
        for token in parser.tokenize(stream, emmit_restart_tokens=True):
            token_first = token.range.first_position.index
            token_after = token.range.position_after.index
            # if token_first > current_index:
            #     token_type = Tokens._TokenTypes.WHITESPACE.value if binary else Tokens._TokenTypes.WHITESPACE.name
            #     token_ranges.append([token_type, current_index, token_first])
            #     current_index = token_first
            # el
            if token_first < current_index:
                raise Exception(token_first,
                                "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                                % (current_index, token_first))
            token_type = token.type.value if binary else token.type.name
            token_ranges.append([token_type, current_index, token_after])
            current_index = token_after
    except TokenizingError as e:
        return error(e)

    # If the last token stops short of the end of the file, pad its range by one character.
    if len(token_ranges) > 0:
        last_token_range = token_ranges[-1]
        if last_token_range[2] < len(file_contents):
            last_token_range[2] += 1

    print("Tokenization took %s seconds" % (time.time() - time_))
    return pack(['tokenize', token_ranges])

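
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the service): how a client-side
# request to the message handler above might look.  The file name and the
# snippet of source text are made-up placeholders; the reply is whatever
# `pack` produces for ['tokenize', token_ranges].
def _example_tokenize_request():
    request = ['tokenize',
               'example.aky',            # hypothetical file name
               'x = 1\ny = x + 2\n',     # hypothetical file contents
               False]                    # binary=False -> token names instead of numeric codes
    return tokenize(request)
# ---------------------------------------------------------------------------
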
def tokenize(options):
    try:
        filename = options.filename
        code = open(filename, encoding='utf-8').read()
        stream = StringStream(code)
        parser = AnokyParser()
        if 'output' in options:
            # Encoded output requested: write one encoded (token, first, after)
            # triple per interval, using whitespace fillers for the gaps
            # between tokens.
            output = options.output
            encoder = options.encoder
            filler_token_value = Tokens.WHITESPACE.value if options.binary else Tokens.WHITESPACE.name
            for token, first_index, index_after in parser.tokenize_with_intervals(stream):
                if token is None:
                    bytes_ = encoder((filler_token_value, first_index, index_after))
                else:
                    token_value = token.type.value if options.binary else token.type.name
                    bytes_ = encoder((token_value, first_index, index_after))
                output.write(bytes_)
        else:
            # No output destination given: just print the tokens.
            for token in parser.tokenize(stream):
                print(str(token))
    except CompilerError as e:
        print(e.trace)

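
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): driving the command-line variant above
# with an argparse-style options object.  The file name and the encoder are
# placeholders; any callable turning a (token, first, after) triple into
# bytes would do, and any writable binary stream works as `output`.
def _example_tokenize_options():
    import argparse
    import pickle
    import sys
    options = argparse.Namespace(
        filename='example.aky',      # hypothetical input file
        binary=True,                 # emit numeric token codes
        output=sys.stdout.buffer,    # destination for the encoded triples
        encoder=pickle.dumps)        # placeholder encoder
    tokenize(options)
# ---------------------------------------------------------------------------
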
async def async_tokenize(id, incomming, outgoing):
    def my_send_message(msg):
        if VERBOSE: print("\treply: " + str(msg))
        return outgoing.push_message(pack(msg))

    def my_error(e):
        nonlocal outgoing
        if VERBOSE: print("\terror: " + str(e))
        return outgoing.push_message(error(e))

    # first message (see below for syntax)
    # It will give us the file name and contents of the written code,
    # and also whether we should mark the first offset as being anything other than zero,
    # and the indentation level at which the code is written
    message = await incomming()
    if not 3 <= len(message) <= 5:
        return outgoing.push_message(error(
            "Async tokenization request format is:\n"
            " first message: ['async_tokenize', file_name:str, file_contents:str, first_offset:int = 0, indentation_level:int = 0]\n"
            " first reply: ['async_tokenize', handler_id:int]\n"
            " following messages: ['async_tokenize_next', handler_id:int]\n"
            " reply: ['async_tokenize_next', token_code, first_index, index_after]\n"
            " ending_message: ['close', handler_id:int]\n"
            " reply: ['close']\n"
            "at any moment, reply may be:\n"
            " ['async_tokenize_error', message:str, first_position?:int, position_after?:int]"))
    file_name = message[1]
    file_contents = message[2]
    if not isinstance(file_name, str):
        return my_error('Async tokenization request: "file_name" arg must be a string.')
    if not isinstance(file_contents, str):
        return my_error('Async tokenization request: "file_contents" arg must be a string.')
    if VERBOSE:
        print("\tfile-name: " + file_name)
        print("\tfile-contents: " + (repr(file_contents) if len(file_contents) < 80
                                     else repr(file_contents[0:80]) + " ..."))
        if len(message) >= 4: print("\toffset: %s " % message[3])
        if len(message) >= 5: print("\tindentation: %s" % message[4])

    # Get global offset of first character, if any
    if len(message) >= 4:
        shift = message[3]
        if not isinstance(shift, int):
            return my_error('Tokenization request: "first_offset" arg must be an integer.')
    else:
        shift = 0

    # get indentation level of code, if any
    if len(message) >= 5:
        indentation_level = message[4]
        if not isinstance(indentation_level, int):
            return my_error('Tokenization request: "indentation_level" arg must be an integer.')
    else:
        indentation_level = 0

    # reply with the id of this async tokenization handler
    my_send_message(['async_tokenize', id])

    # Now the tokenization actually begins
    # We will tokenize each token, and between tokens we wait for the request of the next token.

    # First we prepare the stream, with the right shift and indentation level
    stream = StringStream(file_contents, name=file_name)
    if indentation_level > 0:
        stream = IndentedCharacterStream(stream)
        stream.readn(indentation_level)
        stream.push()

    # Then we tokenize the given text,
    parser = AnokyParser()
    current_index = indentation_level
    try:
        for token in parser.tokenize(stream, emmit_restart_tokens=True):
            token_first = token.range.first_position.index
            token_after = token.range.position_after.index

            # if token_first > current_index:
            #     token_type = Tokens._TokenTypes.WHITESPACE.value
            #
            #     # We wait for the next token request, and emit a whitespace filler to the outgoing socket
            #     message = await incomming()
            #     if VERBOSE: print("\tmessage: %s" % message)
            #     assert len(message) >= 2 and message[1] == id
            #     if message[0] == 'close':
            #         my_send_message(['close'])
            #         return
            #     elif message[0] == 'async_tokenize_next':
            #         my_send_message(['async_tokenize_next', token_type, current_index+shift, token_first+shift])
            #     else:
            #         return my_error("Unknown message for async_tokenize handler, '%s'." % message[0])
            #
            #     current_index = token_first
            # el
            if token_first < current_index:
                raise Exception(token_first,
                                "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                                % (current_index + shift, token_first + shift))

            token_type = token.type.value

            # Now that we know the next token type, we wait for the next token request,
            # and emit it to the outgoing socket
            message = await incomming()
            if VERBOSE: print("\tmessage: " + str(message))
            assert len(message) >= 2 and message[1] == id
            if message[0] == 'close':
                my_send_message(['close'])
                return
            elif message[0] == 'async_tokenize_next':
                my_send_message(['async_tokenize_next', token_type, current_index + shift, token_after + shift])
            else:
                return my_error("Unknown message for async_tokenize handler, '%s'." % message[0])

            current_index = token_after
    except TokenizingError as e:
        return my_error(e)

    # All tokens have been emitted; keep answering with the (-1, -1, -1)
    # sentinel until the client closes the handler.
    while True:
        message = await incomming()
        if VERBOSE: print("\tmessage: %s" % message)
        assert len(message) >= 2 and message[1] == id
        if message[0] == 'close':
            my_send_message(['close'])
            return
        elif message[0] == 'async_tokenize_next':
            my_send_message(['async_tokenize_next', -1, -1, -1])
        else:
            return my_error("Unknown message for async_tokenize handler, '%s'." % message[0])
    return

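
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): driving async_tokenize with in-memory
# stand-ins for the transport.  The real `incomming`/`outgoing` objects are
# only assumed to provide the interface used above: `await incomming()`
# returning the next client message, and `outgoing.push_message(payload)`
# accepting a packed reply.  The scripted client keeps asking for the next
# token until it has seen the (-1, -1, -1) sentinel reply, then closes;
# comparing against `pack(...)` assumes `pack` is deterministic.
class _ScriptedClient:
    def __init__(self, file_name, file_contents, handler_id=0):
        self.replies = []
        self._opened = False
        self._file_name = file_name
        self._file_contents = file_contents
        self._id = handler_id

    def push_message(self, payload):
        # outgoing side: collect every packed reply from the handler
        self.replies.append(payload)

    async def incomming(self):
        # incoming side: first the opening request, then 'next' requests
        # until the end-of-stream sentinel has been seen, then 'close'
        if not self._opened:
            self._opened = True
            return ['async_tokenize', self._file_name, self._file_contents]
        if self.replies and self.replies[-1] == pack(['async_tokenize_next', -1, -1, -1]):
            return ['close', self._id]
        return ['async_tokenize_next', self._id]

async def _example_async_tokenize():
    client = _ScriptedClient('example.aky', 'x = 1\n')   # hypothetical input
    await async_tokenize(0, client.incomming, client)
    return client.replies
# ---------------------------------------------------------------------------
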