Example #1
    def tokenize(message: list) -> list:
        time_ = time.time()
        if not 3 <= len(message) <= 4:
            return error(
                "Tokenization request format is:\n input: ['tokenize', file_name:str, file_contents:str, binary=False]\n output: ['tokenize', token_ranges:list(list(token_code, first_index, index_after))]"
            )
        file_name = message[1]
        file_contents = message[2]
        if not isinstance(file_name, str):
            return error(
                'Tokenization request: "file_name" arg must be a string.')
        if not isinstance(file_contents, str):
            return error(
                'Tokenization request: "file_contents" arg must be a string.')
        if VERBOSE:
            print("\tfile-name: " + file_name)
            print("\tfile-contents: " +
                  (repr(file_contents) if len(file_contents) < 80 else
                   repr(file_contents[0:80]) + " ..."))
        if len(message) == 4:
            binary = message[3]
            if not isinstance(binary, bool):
                return error(
                    'Tokenization request: "binary" arg must be a boolean.')
        else:
            binary = True

        stream = StringStream(file_contents, name=file_name)

        parser = AnokyParser()

        token_ranges = []
        current_index = 0
        try:
            for token in parser.tokenize(stream, emmit_restart_tokens=True):
                token_first = token.range.first_position.index
                token_after = token.range.position_after.index
                # if token_first > current_index:
                #     token_type = Tokens._TokenTypes.WHITESPACE.value if binary else Tokens._TokenTypes.WHITESPACE.name
                #     token_ranges.append([token_type, current_index, token_first])
                #     current_index = token_first
                # el
                if token_first < current_index:
                    raise Exception(
                        token_first,
                        "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                        % (current_index, token_first))
                token_type = token.type.value if binary else token.type.name
                token_ranges.append([token_type, current_index, token_after])
                current_index = token_after
        except TokenizingError as e:
            return error(e)

        if len(token_ranges) > 0:
            last_token_range = token_ranges[-1]
            if last_token_range[2] < len(file_contents):
                last_token_range[2] += 1

        print("Tokenization took %s seconds" % (time_ - time.time()))
        return pack(['tokenize', token_ranges])
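The handler returns pack(['tokenize', token_ranges]); the sketch below shows what a consumer might do with that reply once it has been unpacked back into a plain list. The sample file contents and token codes are made up for illustration, only the ['tokenize', token_ranges] shape comes from the handler itself.

# Hypothetical consumer of a decoded 'tokenize' reply (sample data, not real output).
file_contents = "foo = (bar 1 2)\n"
reply = ['tokenize', [[3, 0, 4], [7, 4, 6], [2, 6, 16]]]  # [token_code, first_index, index_after]

assert reply[0] == 'tokenize'
for token_code, first_index, index_after in reply[1]:
    # Each range covers the token plus any preceding gap (see current_index in the handler).
    print(token_code, repr(file_contents[first_index:index_after]))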
Example #2
def tokenize(options):
    try:
        filename = options.filename


        # Read the whole source file; a context manager ensures the handle is closed.
        with open(filename, encoding='utf-8') as f:
            code = f.read()
        stream = StringStream(code)

        parser = AnokyParser()

        if 'output' in options:
            output = options.output
            encoder = options.encoder
            filler_token_value = Tokens.WHITESPACE.value if options.binary else Tokens.WHITESPACE.name
            for token, first_index, index_after in parser.tokenize_with_intervals(stream):
                if token is None:
                    bytes_ = encoder((filler_token_value, first_index, index_after))
                else:
                    token_value = token.type.value if options.binary else token.type.name
                    bytes_ = encoder((token_value, first_index, index_after))
                output.write(bytes_)
        else:
            for token in parser.tokenize(stream):
                print(str(token))

    except CompilerError as e:
        print(e.trace)
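A sketch of how the options argument might be assembled for the interval-encoding path. The argparse-style Namespace, the file name, and the struct-based encoder are all assumptions; the snippet only shows which attributes the function reads (filename, output, encoder, binary).

from argparse import Namespace
from io import BytesIO
import struct

# Hypothetical encoder: packs (token_code, first_index, index_after) as three
# little-endian 32-bit integers. The real encoder's format is not shown above,
# and this assumes token codes are small integers (binary=True path).
def encoder(triple):
    return struct.pack('<iii', *triple)

options = Namespace(
    filename='example.any',  # made-up source file name
    output=BytesIO(),        # any binary file-like object with .write()
    encoder=encoder,
    binary=True,             # emit numeric token codes rather than names
)
# tokenize(options)  # call once the module defining tokenize/AnokyParser is importable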
Example #3
    async def async_tokenize(id, incomming, outgoing):
        def my_send_message(msg):
            if VERBOSE: print("\treply: " + str(msg))
            return outgoing.push_message(pack(msg))

        def my_error(e):
            nonlocal outgoing
            if VERBOSE: print("\terror: " + str(e))
            return outgoing.push_message(error(e))

        # First message (see the format string below for its syntax).
        # It gives us the file name and the contents of the code to tokenize,
        # optionally a global offset to apply to the first character (instead of zero),
        # and the indentation level at which the code is written.
        message = await incomming()

        if not 3 <= len(message) <= 5:
            return outgoing.push_message(
                error(
                    "Async tokenization request format is:\n"
                    " first message: ['async_tokenize', file_name:str, file_contents:str, first_offset:int = 0, indentation_level:int = 0]\n"
                    " first reply: ['async_tokenize', handler_id:int]\n"
                    " following messages: ['async_tokenize_next', handler_id:int]\n"
                    " reply: ['async_tokenize_next', token_code, first_index, index_after]\n"
                    " ending_message: ['close', handler_id:int]\n"
                    " reply: ['close']"
                    "at any moment, reply may be:"
                    "  ['async_tokenize_error', message:str, first_position?:int, position_after?:int]"
                ))

        file_name = message[1]
        file_contents = message[2]
        if not isinstance(file_name, str):
            return my_error(
                'Async tokenization request: "file_name" arg must be a string.'
            )
        if not isinstance(file_contents, str):
            return my_error(
                'Async tokenization request: "file_contents" arg must be a string.'
            )

        if VERBOSE:
            print("\tfile-name: " + file_name)
            print("\tfile-contents: " +
                  (repr(file_contents) if len(file_contents) < 80 else
                   repr(file_contents[0:80]) + " ..."))
            if len(message) >= 4: print("\toffset: %s " % message[3])
            if len(message) >= 5: print("\tindentation: %s" % message[4])

        # Get global offset of first character, if any
        if len(message) >= 4:
            shift = message[3]
            if not isinstance(shift, int):
                return my_error(
                    'Tokenization request: "first_offset" arg must be an integer.'
                )
        else:
            shift = 0

        # get indentation level of code, if any
        if len(message) >= 5:
            indentation_level = message[4]
            if not isinstance(indentation_level, int):
                return my_error(
                    'Tokenization request: "indentation_level" arg must be an integer.'
                )
        else:
            indentation_level = 0

        # reply with the id of this async tokenization handler
        my_send_message(['async_tokenize', id])

        # Now the tokenization actually begins.
        # We emit one token at a time, waiting for the next-token request in between.

        # First we prepare the stream, with the right shift and indentation level
        stream = StringStream(file_contents, name=file_name)

        if indentation_level > 0:
            stream = IndentedCharacterStream(stream)
            stream.readn(indentation_level)
            stream.push()

        # Then we tokenize the given text.
        parser = AnokyParser()
        current_index = indentation_level
        try:
            for token in parser.tokenize(stream, emmit_restart_tokens=True):
                token_first = token.range.first_position.index
                token_after = token.range.position_after.index
                # if token_first > current_index:
                #     token_type = Tokens._TokenTypes.WHITESPACE.value
                #     # We wait for the next token request, and emit a whitespace filler to the outgoing socket
                #     message = await incomming()
                #     if VERBOSE: print("\tmessage: %s" % message)
                #     assert len(message) >= 2 and message[1] == id
                #     if message[0] == 'close':
                #         my_send_message(['close'])
                #         return
                #     elif message[0] == 'async_tokenize_next':
                #         my_send_message(['async_tokenize_next', token_type, current_index+shift, token_first+shift])
                #     else:
                #         return my_error("Unkown message for async_tokenize handler, '%s'." % message[0])
                #     current_index = token_first
                # el
                if token_first < current_index:
                    raise Exception(
                        token_first,
                        "Overlapping tokens (%s, %s), something is wrong with the tokenizer!!!"
                        % (current_index + shift, token_first + shift))
                token_type = token.type.value

                # Now that we know the next token type, we wait for the next token request,
                # and emit it to the outgoing socket
                message = await incomming()
                if VERBOSE:
                    print("\tmessage: " + str(message))
                assert len(message) >= 2 and message[1] == id
                if message[0] == 'close':
                    my_send_message(['close'])
                    return
                elif message[0] == 'async_tokenize_next':
                    my_send_message([
                        'async_tokenize_next', token_type,
                        current_index + shift, token_after + shift
                    ])
                else:
                    return my_error(
                        "Unkown message for async_tokenize handler, '%s'." %
                        message[0])
                current_index = token_after
        except TokenizingError as e:
            return my_error(e)

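        # All tokens have been emitted; keep answering further 'async_tokenize_next'
        # requests with the (-1, -1, -1) sentinel until the client sends 'close'.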
        while True:
            message = await incomming()
            if VERBOSE: print("\tmessage: %s" % message)
            assert len(message) >= 2 and message[1] == id
            if message[0] == 'close':
                my_send_message(['close'])
                return
            elif message[0] == 'async_tokenize_next':
                my_send_message(['async_tokenize_next', -1, -1, -1])
            else:
                return my_error(
                    "Unkown message for async_tokenize handler, '%s'." %
                    message[0])

        return
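For completeness, a client-side driver for this protocol could look like the sketch below. The send/recv callables, the unpacking of replies into plain lists, and the RuntimeError raised on an 'async_tokenize_error' reply are assumptions about the transport layer, which the handler itself does not show.

# Hypothetical client-side driver for the async tokenization protocol above.
# `send` and `recv` are assumed awaitable transport callables; messages are
# plain lists, mirroring what the handler expects and replies with.
async def drive_async_tokenize(send, recv, file_name, file_contents,
                               first_offset=0, indentation_level=0):
    await send(['async_tokenize', file_name, file_contents,
                first_offset, indentation_level])
    reply = await recv()                    # ['async_tokenize', handler_id]
    handler_id = reply[1]

    token_ranges = []
    while True:
        await send(['async_tokenize_next', handler_id])
        reply = await recv()                # ['async_tokenize_next', code, first, after]
        if reply[0] == 'async_tokenize_error':
            raise RuntimeError(reply[1])
        token_code, first_index, index_after = reply[1:4]
        if token_code == -1:                # sentinel: no more tokens
            break
        token_ranges.append((token_code, first_index, index_after))

    await send(['close', handler_id])
    await recv()                            # ['close']
    return token_ranges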