Example #1
def forward_until_new(s):
    """Forward through whitespace, stopping at the first non-whitespace
    character or after the first line break."""
    t = TokenWithPosition('', s.peek().position)
    while (s.hasNext()
           and any(s.peek().startswith(substr) for substr in string.whitespace)
           and not t.strip(" ").endswith('\n')):
        t += s.forward(1)
    return t
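
For reference, a minimal self-contained sketch of the same technique: forward through a buffer until the first non-whitespace character. MiniBuffer and skip_whitespace are toy stand-ins for TexSoup's Buffer and TokenWithPosition machinery, not the library's actual API.

import string

class MiniBuffer:
    """Toy cursor over a string, assumed here for illustration only."""
    def __init__(self, text):
        self.text, self.position = text, 0

    def hasNext(self):
        return self.position < len(self.text)

    def peek(self):
        return self.text[self.position]

    def forward(self, n):
        chunk = self.text[self.position:self.position + n]
        self.position += n
        return chunk

def skip_whitespace(buf):
    """Consume characters up to the first non-whitespace one."""
    out = ''
    while buf.hasNext() and buf.peek() in string.whitespace:
        out += buf.forward(1)
    return out

b = MiniBuffer('  \n hello')
assert skip_whitespace(b) == '  \n '
assert b.peek() == 'h'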
Example #2
def tokenize_math(text):
    r"""Prevents math from being tokenized.

    :param Buffer text: iterator over line, with current position

    >>> b = Buffer('$$\min_x$$ \command')
    >>> tokenize_math(b)
    '$$\\min_x$$'
    """

    def escaped_dollar():
        return text.peek() == '$' and result[-1] == '\\'

    def end_detected():
        return (text.peek((0, len(starter))) == starter
                and not escaped_dollar())

    result = TokenWithPosition('', text.position)
    if text.startswith('$'):
        starter = '$$' if text.startswith('$$') else '$'
        result += text.forward(len(starter))
        while text.hasNext() and not end_detected():
            result += next(text)
        if not text.startswith(starter):
            raise EOFError('Expecting %s. Instead got %s' % (
                starter, text.peek((0, 5))))
        result += text.forward(len(starter))
        return result
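
The heart of tokenize_math is its end-detection rule: an unescaped repetition of the opening delimiter ('$' or '$$') closes the environment. A plain-string sketch of that rule, with grab_math as a hypothetical name operating on str rather than a Buffer:

def grab_math(text):
    """Return the leading $...$ or $$...$$ span of text (sketch)."""
    if not text.startswith('$'):
        return None
    starter = '$$' if text.startswith('$$') else '$'
    i = len(starter)
    while i < len(text):
        # An unescaped starter closes the environment.
        if text.startswith(starter, i) and text[i - 1] != '\\':
            return text[:i + len(starter)]
        i += 1
    raise EOFError('Expecting %s' % starter)

assert grab_math(r'$$\min_x$$ \command') == r'$$\min_x$$'
assert grab_math(r'$a \$ b$ rest') == r'$a \$ b$'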
Example #3
def tokenize_string(text, delimiters=None):
    r"""Process a string of text

    :param Buffer text: iterator over line, with current position
    :param Union[None,iterable,str] delimiters: defines the delimiters

    >>> tokenize_string(Buffer('hello'))
    'hello'
    >>> b = Buffer(r'hello again\command')
    >>> tokenize_string(b)
    'hello again'
    >>> print(b.peek())
    \
    >>> print(tokenize_string(Buffer(r'0 & 1 \\\command')))
    0 & 1 \\
    """
    if delimiters is None:
        delimiters = ALL_TOKENS
    result = TokenWithPosition('', text.position)
    for c in text:
        if c == '\\' and str(text.peek()) in delimiters and str(
                c + text.peek()) not in delimiters:
            c += next(text)
        elif str(c) in delimiters:  # assumes all tokens are single characters
            text.backward(1)
            return result
        result += c
        if text.peek((0, 2)) == '\\\\':
            result += text.forward(2)
        if text.peek((0, 2)) == '\n\n':
            result += text.forward(2)
            return result
    return result
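
tokenize_string's loop has two special cases worth isolating: an escaped delimiter (e.g. \%) stays inside the text token, while a bare delimiter ends it. A simplified string-level sketch of that scan; take_until and its delimiter set are assumptions for illustration, not TexSoup's ALL_TOKENS:

def take_until(text, delimiters='\\{}[]$%'):
    """Split text into (plain prefix, remainder) at the first delimiter,
    keeping escaped delimiters inside the prefix (sketch)."""
    out, i = '', 0
    while i < len(text):
        c = text[i]
        if c == '\\' and i + 1 < len(text) and text[i + 1] in delimiters:
            out += text[i:i + 2]  # escaped delimiter stays in the text
            i += 2
        elif c in delimiters:
            break
        else:
            out += c
            i += 1
    return out, text[i:]

assert take_until(r'hello again\command') == ('hello again', r'\command')
assert take_until(r'50\% off$x$') == (r'50\% off', '$x$')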
Example #4
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    """
    c = next(src)
    if c.startswith('$'):
        name = '$$' if c.startswith('$$') else '$'
        return TexEnv(name, [c[len(name):-len(name)]], nobegin=True)
    if c == '\\':
        if src.peek().startswith('item '):
            mode, expr = 'command', TexCmd(
                src.peek()[:4], (),
                TokenWithPosition.join(next(src).split(' ')[1:], glue=' ').strip())
        elif src.peek() == 'begin':
            mode, expr = next(src), TexEnv(Arg.parse(src.forward(3)).value)
        else:
            mode, candidate, expr = 'command', next(src), None
            for i, c in enumerate(candidate):
                if c.isspace():
                    expr = TexCmd(candidate[:i], (), candidate[i+1:])
                    break
            if not expr:
                expr = TexCmd(candidate)
        while src.peek() in ARG_START_TOKENS:
            expr.args.append(read_tex(src))
        if mode == 'begin':
            read_env(src, expr)
        if src.startswith('$'):
            expr.add_contents(read_tex(src))
        return expr
    if c.startswith('\\'):
        return TexCmd(c[1:])
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
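
read_tex is a recursive-descent reader: it dispatches on the first token of the stream and calls itself to read each argument. A toy instance of the same pattern, parsing only nested brace groups (read_group is hypothetical and ignores commands and math entirely):

def read_group(s, i=0):
    """Parse one {...} group starting at s[i] into a nested list (sketch)."""
    assert s[i] == '{'
    items, i = [], i + 1
    while s[i] != '}':
        if s[i] == '{':
            sub, i = read_group(s, i)  # recurse for a nested group
            items.append(sub)
        else:
            items.append(s[i])
            i += 1
    return items, i + 1  # skip the closing brace

tree, end = read_group('{a{b}c}')
assert tree == ['a', ['b'], 'c'] and end == 7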
Example #5
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    """
    c = next(src)
    if c.startswith('$'):
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    if c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            extra = src.forward_until(lambda s: any(
                s.startswith(t) for t in ('\n', r'\end', r'\item')))
            mode, expr = 'command', TexCmd(
                command, (),
                TokenWithPosition.join(extra.split(' '), glue=' ').strip())
        elif command == 'begin':
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)

        # TODO: allow only one line break
        # TODO: should really be handled by tokenizer
        candidate_index = src.num_forward_until(lambda s: not s.isspace())
        src.forward(candidate_index)

        while src.peek() in ARG_START_TOKENS:
            expr.args.append(read_tex(src))
        if not expr.args:
            src.backward(candidate_index)
        if mode == 'begin':
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
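
The forward(candidate_index)/backward(candidate_index) pair is look-ahead with backtracking: whitespace after a command is consumed only if argument tokens actually follow it. An index-based sketch of that dance (skip_space_if_arg_follows is a made-up helper, with '{' and '[' standing in for ARG_START_TOKENS):

def skip_space_if_arg_follows(text, i):
    """Advance past whitespace only when an argument opener follows (sketch)."""
    j = i
    while j < len(text) and text[j].isspace():
        j += 1
    if j < len(text) and text[j] in '{[':
        return j  # commit: arguments follow the whitespace
    return i      # backtrack: whitespace belongs to the surrounding text

assert skip_space_if_arg_follows(r'\bf {x}', 3) == 4
assert skip_space_if_arg_follows(r'\bf next', 3) == 3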
Example #6
def tokenize_math(text):
    r"""Prevents math from being tokenized.

    :param Buffer text: iterator over line, with current position

    >>> b = Buffer('$\min_x$ \command')
    >>> tokenize_math(b)
    '$'
    >>> b = Buffer('$$\min_x$$ \command')
    >>> tokenize_math(b)
    '$$'
    """
    if text.startswith('$') and (text.position == 0 or text.peek(-1) != '\\'):
        starter = '$$' if text.startswith('$$') else '$'
        return TokenWithPosition(text.forward(len(starter)), text.position)
Example #7
def tokenize_line_comment(text):
    r"""Process a line comment

    :param Buffer text: iterator over line, with current position

    >>> tokenize_line_comment(Buffer('hello %world'))
    >>> tokenize_line_comment(Buffer('%hello world'))
    '%hello world'
    >>> tokenize_line_comment(Buffer('%hello\n world'))
    '%hello'
    """
    result = TokenWithPosition('', text.position)
    if text.peek() == '%' and text.peek(-1) != '\\':
        result += text.forward(1)
        while text.peek() != '\n' and text.hasNext():
            result += text.forward(1)
        return result
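
The rule is simple enough to state over a plain string: a comment runs from an unescaped % to the end of the line. A sketch with take_line_comment as a hypothetical stand-in for the Buffer-based version (the escaped-\% check is elided, since a leading % has nothing before it):

def take_line_comment(text):
    """Return the leading %-comment of text, or None (sketch)."""
    if not text.startswith('%'):
        return None
    end = text.find('\n')
    return text if end == -1 else text[:end]

assert take_line_comment('%hello\n world') == '%hello'
assert take_line_comment('hello %world') is None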
Example #8
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    """
    c = next(src)
    if c.startswith('%'):
        return c
    if c.startswith('$'):
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    if c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            extra, arg = read_item(src)
            mode, expr = 'command', TexCmd(command, arg, extra)
        elif command == 'begin':
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)

        # TODO: should really be handled by tokenizer
        candidate_index = src.num_forward_until(lambda s: not s.isspace())
        src.forward(candidate_index)

        line_breaks = 0
        while (src.peek() in ARG_START_TOKENS
               or (src.peek() == '\n' and line_breaks == 0)):
            if src.peek() == '\n':
                # Advance buffer if first newline
                line_breaks += 1
                next(src)
            else:
                line_breaks = 0
                expr.args.append(read_tex(src))
        if not expr.args:
            src.backward(candidate_index)
        if mode == 'begin':
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
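
The loop's novelty over Example #5 is tolerating exactly one line break between a command and its arguments. The same rule over a flat token list (args_across_one_newline and the two-token ARG_START set are assumptions for illustration):

def args_across_one_newline(tokens):
    """Collect argument openers, swallowing at most one newline run (sketch)."""
    ARG_START = {'{', '['}
    args, i, breaks = [], 0, 0
    while i < len(tokens):
        if tokens[i] == '\n' and breaks == 0:
            breaks += 1  # swallow the first newline only
            i += 1
        elif tokens[i] in ARG_START:
            breaks = 0
            args.append(tokens[i])
            i += 1
        else:
            break
    return args

assert args_across_one_newline(['\n', '{', '[', 'x']) == ['{', '[']
assert args_across_one_newline(['\n', '\n', '{']) == []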
Example #9
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    """
    c = next(src)
    if c.startswith('%'):
        return c
    elif c.startswith('$'):
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    elif c.startswith(r'\[') or c.startswith(r'\('):
        if c.startswith(r'\['):
            name = 'displaymath'
            begin = r'\['
            end = r'\]'
        else:
            name = 'math'
            begin = r'\('
            end = r'\)'

        expr = TexEnv(name, [], nobegin=True, begin=begin, end=end)
        return read_math_env(src, expr)
    elif c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            contents, arg = read_item(src)
            mode, expr = 'command', TexCmd(command, contents, arg)
        elif command == 'begin':
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)

        expr.args = read_args(src, expr.args)

        if mode == 'begin':
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c
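
The \[ ... \] and \( ... \) forms are just unnamed math environments, so a small lookup table captures the branch above. classify_math is a hypothetical helper using the same displaymath/math names as the example:

MATH_OPENERS = {
    r'\[': ('displaymath', r'\]'),
    r'\(': ('math', r'\)'),
}

def classify_math(token):
    r"""Map a \[ or \( opener to (env name, opener, closer) (sketch)."""
    for opener, (name, closer) in MATH_OPENERS.items():
        if token.startswith(opener):
            return name, opener, closer
    return None

assert classify_math(r'\[ x^2 \]') == ('displaymath', r'\[', r'\]')
assert classify_math(r'\( y \)') == ('math', r'\(', r'\)')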
Example #10
def read_item(src):
    r"""Read the item content.

    There can be any number of whitespace characters between \item and the first
    non-whitespace character. However, after that first non-whitespace
    character, the item can only tolerate one successive line break at a time.

    \item can also take an argument.

    :param Buffer src: a buffer of tokens
    :return: contents of the item and any item arguments
    """
    def stringify(s):
        return TokenWithPosition.join(s.split(' '), glue=' ')

    def criterion(s):
        """Return True at the first non-whitespace character"""
        return not any(s.startswith(substr) for substr in string.whitespace)

    # Item argument such as in description environment
    arg = []
    if src.peek() in ARG_START_TOKENS:
        c = next(src)
        arg.append(read_arg(src, c))
    last = stringify(src.forward_until(criterion))
    if last.startswith(' '):
        last = last[1:]
    extra = [last]

    while src.hasNext() and not src.startswith('\n\n') and \
            not src.startswith(r'\item') and \
            not src.startswith(r'\end') and \
            not (hasattr(last, 'endswith') and last.endswith('\n\n')
                 and len(extra) > 1):
        last = read_tex(src)
        extra.append(last)
    return extra, arg
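
read_item's stopping rule, stripped of the buffer mechanics: the body ends at the next \item, \end, or blank line. A string-level sketch of just that rule (item_body is a made-up name):

def item_body(text):
    r"""Collect an \item body up to the next \item, \end, or blank line
    (sketch of read_item's stopping rule)."""
    end = len(text)
    for stop in ('\n\n', r'\item', r'\end'):
        pos = text.find(stop)
        if pos != -1:
            end = min(end, pos)
    return text[:end].strip()

assert item_body('first point\n' + r'\item second') == 'first point'
assert item_body('only point\n\nrest') == 'only point'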
Example #11
def stringify(s):
    return TokenWithPosition.join(s.split(' '), glue=' ')
Example #12
def read_tex(src):
    r"""Read next expression from buffer

    :param Buffer src: a buffer of tokens
    """
    c = next(src)
    if c.startswith('%'):
        return c
    elif c.startswith('$'):
        name = '$$' if c.startswith('$$') else '$'
        expr = TexEnv(name, [], nobegin=True)
        return read_math_env(src, expr)
    elif c.startswith(r'\[') or c.startswith(r'\('):
        if c.startswith(r'\['):
            name = 'displaymath'
            begin = r'\['
            end = r'\]'
        else:
            name = 'math'
            begin = r'\('
            end = r'\)'

        expr = TexEnv(name, [], nobegin=True, begin=begin, end=end)
        return read_math_env(src, expr)
    elif c.startswith('\\'):
        command = TokenWithPosition(c[1:], src.position)
        if command == 'item':
            extra, arg, stuff = read_item(src)
            mode, expr = 'command', TexCmd(command, arg, extra, stuff)
        elif command == 'begin':
            mode, expr, _ = 'begin', TexEnv(src.peek(1)), src.forward(3)
        else:
            mode, expr = 'command', TexCmd(command)

        # TODO: should really be handled by tokenizer
        stuff_index, candidate_index = 0, src.num_forward_until(
            lambda s: not s.isspace())
        while src.peek().isspace():
            stuff_index += 1
            expr.stuff.append(read_tex(src))

        line_breaks = 0
        while (src.peek() in ARG_START_TOKENS
               or (src.peek().isspace() and line_breaks == 0)):
            space_index = src.num_forward_until(lambda s: not s.isspace())
            if space_index > 0:
                line_breaks += 1
                if src.peek((0, space_index)).count("\n") <= 1 and src.peek(
                        space_index) in ARG_START_TOKENS:
                    expr.stuff.append(read_tex(src))
            else:
                line_breaks = 0
                tex_text = read_tex(src)
                expr.args.append(tex_text)
                expr.stuff.append(tex_text)
        if not expr.args:
            if stuff_index > 0:
                del expr.stuff[-stuff_index:]
            src.backward(candidate_index)
        if mode == 'begin':
            read_env(src, expr)
        return expr
    if c in ARG_START_TOKENS:
        return read_arg(src, c)
    return c