コード例 #1
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Tokenize XHTML *text* into LP tokens.

        A SAX pass first flattens the markup, wrapping code blocks in
        ``@start_block@``/``@end_block@`` sentinels; command definitions
        and block-code chunks are then located by regexp in the flattened
        text.

        Returns a list of CmdToken/BlkCodeToken objects sorted by start
        offset, terminated by an EndToken.
        """
        xhtmlsaxhandler = XHTMLSaxHandler(styles=self.styles, tags=self.tags)
        xhtmlsaxerrhandler = XHTMLSaxErrorHandler(self, self.ignore)
        # sax.parseString() needs bytes, not str, so encode str input first.
        # isinstance() (not type(...) is str) also accepts str subclasses.
        if isinstance(text, str):
            parsetext = core.strtobytes23(text)
        else:
            parsetext = text
        sax.parseString(parsetext, xhtmlsaxhandler, xhtmlsaxerrhandler)
        text = ''.join(xhtmlsaxhandler.text)

        # NOTE: inline-code extraction is intentionally disabled for HTML:
        # line breaks inside inline chunks are encoded via paragraph styling,
        # so linearizing them does not help (the OO parser shows the enabled
        # variant).
        _blkcode_re = r'@start_block@(.*?)@end_block@'
        blkcode_re = re.compile(_blkcode_re, re.DOTALL | re.MULTILINE)

        cmds = core.Cmd.syntax.findtokens('cmddef', text)
        blkcodes = blkcode_re.finditer(text)

        tokens = []

        for m in cmds:
            token = core.CmdToken(
                XHTMLSaxHandler._decode_spec_chars(m.group(1)), m.start(0),
                m.end(0))
            tokens.append(token)
        for m in blkcodes:
            # strip surrounding newlines/trailing spaces, then remove the
            # common left indent of the whole block
            tokentext = m.group(1).lstrip('\n').rstrip('\n ')
            tokentext = core.deltextindent(tokentext)
            token = core.BlkCodeToken(
                XHTMLSaxHandler._decode_spec_chars(tokentext), m.start(0),
                m.end(0))
            tokens.append(token)

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens
コード例 #2
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Tokenize OpenOffice *text* into LP tokens.

        A SAX pass first flattens the document, wrapping inline chunks in
        ``@start_inline@``/``@end_inline@`` and code blocks in
        ``@start_block@``/``@end_block@`` sentinels; commands, inline and
        block chunks are then located by regexp in the flattened text.

        Returns a list of CmdToken/InlCodeToken/BlkCodeToken objects sorted
        by start offset, terminated by an EndToken.
        """
        oosaxhandler = OOSaxHandler(style=self.style)
        # sax.parseString() needs bytes, not str, so encode str input first.
        # isinstance() (not type(...) is str) also accepts str subclasses.
        if isinstance(text, str):
            parsetext = core.strtobytes23(text)
        else:
            parsetext = text
        sax.parseString(parsetext, oosaxhandler)
        text = ''.join(oosaxhandler.text)

        _inlcode_re = r'@start_inline@(.*?)@end_inline@'
        _blkcode_re = r'@start_block@(.*?)@end_block@'

        inlcode_re = re.compile(_inlcode_re, re.MULTILINE)
        # NOTE(review): no re.DOTALL here (unlike the XHTML parser), so a
        # block's content must not span multiple lines after SAX flattening
        # — confirm this is intended.
        blkcode_re = re.compile(_blkcode_re, re.MULTILINE)

        cmds = core.Cmd.syntax.findtokens('cmddef', text)
        inlcodes = inlcode_re.finditer(text)
        blkcodes = blkcode_re.finditer(text)

        tokens = []

        for m in cmds:
            token = core.CmdToken(OOSaxHandler._decode_spec_chars(m.group(1)),
                                  m.start(0), m.end(0))
            tokens.append(token)
        for m in inlcodes:
            # XXX break line in OO is coded with paragraph styling and this
            # linearizing does not help
            tokentext = OOSaxHandler._decode_spec_chars(m.group(1))
            tokentext = core.InlCodeToken.linearize(tokentext)
            token = core.InlCodeToken(tokentext, m.start(0), m.end(0))
            tokens.append(token)
        for m in blkcodes:
            token = core.BlkCodeToken(
                OOSaxHandler._decode_spec_chars(m.group(1)), m.start(0),
                m.end(0))
            tokens.append(token)

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens
コード例 #3
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Returns tokens

        >>> text = \
        'Example of Literate Programming in Markdown\\n' \
        '===========================================\\n\\n' \
        \
        'Code 1\\n' \
        '------\\n\\n' \
        \
        'Test if variable is negative looks like <<c.isneg>>: `if a < 0`.\\n' \
        'So, we can write absolute function <<c.fun>>:\\n\\n' \
        \
        '    def fun(x):\\n' \
        '        <<=c.isneg,a:v>>:\\n' \
        '            a += 100\\n' \
        '            return -a\\n\\n' \
        \
        'And <<c.sum>>:\\n\\n' \
        \
        '    def sum(x, y):\\n' \
        '        return x+y\\n\\n' \
        \
        'not code\\n' \
        'not code\\n\\n' \
         \
        'Lalalalalalal\\n' \
        'Lalalalalalal\\n'
        >>> p = MDParser()
        >>> toks = p.tokens(text)
        >>> [tok.__class__.__name__ for tok in toks]
        ['CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken']
        >>> toks[0].text
        'c.isneg'
        >>> toks[1].text
        'if a < 0'
        >>> toks[2].text
        'c.fun'
        >>> toks[3].text.startswith('def fun')
        True
        >>> toks[3].text.endswith('return -a')
        True
        """
        # possible left padding of a code-block line: 4 spaces or a tab
        pad = '|'.join(re.escape(s) for s in (4 * ' ', '\t'))

        inl_rx = re.compile(r'`([^`]+)`')
        blk_rx = re.compile(r'^\n(?P<code>((%s)(.*?)\n|\n)+)$' % pad,
                            re.MULTILINE)
        # matches one padding unit at the start of every line of a block
        unindent_rx = re.compile('^(%s)' % pad, re.MULTILINE)

        tokens = []
        for m in core.Cmd.syntax.findtokens('cmddef', text):
            tokens.append(core.CmdToken(m.group(1), m.start(0), m.end(0)))
        for m in inl_rx.finditer(text):
            flat = core.InlCodeToken.linearize(m.group(1))
            tokens.append(core.InlCodeToken(flat, m.start(0), m.end(0)))
        for m in blk_rx.finditer(text):
            # drop surrounding blank lines, then remove the first padding
            # unit from each line of the block
            body = unindent_rx.sub('', m.group('code').strip('\n'))
            tokens.append(core.BlkCodeToken(body, m.start(0), m.end(0)))

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens
コード例 #4
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Returns tokens

        >>> text = \
        '\\\\documentclass[12pt]{article}\\n' \
        '\\\\usepackage{amsmath}\\n' \
        '\\\\title{\\\\LaTeX}\\n' \
        '\\\\date{}\\n' \
        '\\\\begin{document}\\n' \
        '  \\\\maketitle\\n' \
        '  \\\\LaTeX{} is a document preparation system for the \\\\TeX{}\\n' \
        '\\n' \
        '  Testing of negative value <<isneg>>: \\\\verb#if a < 0#. Signature will\\n' \
        '  be <<fn.abs.decl>>: \\\\verb!int abs(int a)!. And now:\\n' \
        '  function absolute <<fn.abs>>:\\n' \
        '\\n' \
        '  \\\\begin{verbatim}\\n' \
        '    <<=fn.abs.decl>>\\n' \
        '        if (<<=isneg, x:a>>) return -a;\\n' \
        '        else return a;\\n' \
        '    }\\n' \
        '  \\\\end{verbatim}\\n' \
        '\\n' \
        '  % This is a comment; it will not be shown in the final output.\\n' \
        '  % The following shows a little of the typesetting power of LaTeX:\\n' \
        '  \\\\begin{align}\\n' \
        '    m &= \\\\frac{m_0}{\\\\sqrt{1-\\\\frac{v^2}{c^2}}}\\n' \
        '  \\\\end{align}\\n' \
        '\\\\end{document}'
        >>> p = TeXParser()
        >>> toks = p.tokens(text)
        >>> [tok.__class__.__name__ for tok in toks]
        ['CmdToken', 'InlCodeToken', 'CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken']
        >>> toks[0].text
        'isneg'
        >>> toks[1].text
        'if a < 0'
        >>> toks[2].text
        'fn.abs.decl'
        >>> toks[3].text == 'int abs(int a)'
        True
        >>> toks[5].text.startswith('<<=fn.abs.decl>>')
        True
        >>> toks[5].text.endswith('}')
        True
        """
        inl_names = '|'.join(re.escape(s) for s in TeXParser.inlcmds)
        blk_names = '|'.join(re.escape(s) for s in TeXParser.blkcmds)

        # body of an inline chunk: either {...} or delimited by +, # or !
        body_pat = r'{(?P<code1>.+?)}|(?P<s>[+#!])(?P<code2>.+?)(?P=s)'
        inl_rx = re.compile(r'\\(%s)%s' % (inl_names, body_pat))
        blk_rx = re.compile(
            r'\\begin(\[.+?\])?{(%s)}(?P<code>.*?)\\end{(%s)}' % (
                blk_names, blk_names), re.DOTALL)

        tokens = []
        for m in core.Cmd.syntax.findtokens('cmddef', text):
            tokens.append(core.CmdToken(m.group(1), m.start(0), m.end(0)))
        for m in inl_rx.finditer(text):
            # exactly one of the two alternatives captured the chunk body
            which = 'code1' if m.group('code1') else 'code2'
            flat = core.InlCodeToken.linearize(m.group(which))
            tokens.append(core.InlCodeToken(flat, m.start(0), m.end(0)))
        for m in blk_rx.finditer(text):
            # drop surrounding newlines/trailing spaces, then the common
            # left indent of the whole block
            body = m.group('code').lstrip('\n').rstrip('\n ')
            tokens.append(core.BlkCodeToken(core.deltextindent(body),
                                            m.start(0), m.end(0)))

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens
コード例 #5
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Returns tokens

        >>> text = \
        'Example of Literate Programming in Asciidoc\\n' \
        '===========================================\\n\\n' \
        \
        'Code 1\\n' \
        '------\\n\\n' \
        \
        'Test if variable is negative looks like <<c.isneg>>: +if a < 0+.\\n' \
        'So, we can write absolute function <<c.fun>>\\n' \
        '[source, python]\\n' \
        '----\\n' \
        '    def fun(x):\\n' \
        '        <<=c.isneg,a:v>>:\\n' \
        '            a += 100\\n' \
        '            return -a\\n\\n' \
        '----\\n\\n' \
        'And <<c.sum>>\\n\\n' \
        '----\\n' \
        '    def sum(x, y):\\n\\n' \
        \
        '        return x+y\\n\\n' \
        '    x += 1\\n' \
        '----'
        >>> p = AsciiDocParser()
        >>> toks = p.tokens(text)
        >>> [tok.__class__.__name__ for tok in toks]
        ['CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken']
        >>> toks[0].text
        'c.isneg'
        >>> toks[1].text
        'if a < 0'
        >>> toks[2].text
        'c.fun'
        >>> toks[3].text.startswith('def fun')
        True
        >>> toks[3].text.endswith('        return -a')
        True
        >>> toks[5].text.startswith('def sum')
        True
        >>> toks[5].text.endswith('x += 1')
        True
        """
        inl_rx = re.compile(r'\+([^\n\+]+)\+')
        # block: '----' fence after a [source...] attribute line or a blank line
        blk_rx = re.compile(
            r'(?:\[source[^]]*?]\n|\n\n)-{4,}(?P<code>.*?)\n-{4,}', re.DOTALL)

        tokens = []
        for m in core.Cmd.syntax.findtokens('cmddef', text):
            tokens.append(core.CmdToken(m.group(1), m.start(0), m.end(0)))
        for m in inl_rx.finditer(text):
            flat = core.InlCodeToken.linearize(m.group(1))
            tokens.append(core.InlCodeToken(flat, m.start(0), m.end(0)))
        for m in blk_rx.finditer(text):
            # drop surrounding blank lines, then the common left indent
            body = core.deltextindent(m.group('code').strip('\n'))
            tokens.append(core.BlkCodeToken(body, m.start(0), m.end(0)))

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens
コード例 #6
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Returns tokens

        >>> text = \
        '= Example of Literate Programming in Txt2Tags =\\n' \
        \
        '== Code 1 ==\\n' \
        \
        'Test if variable is negative looks like <<c.isneg>>: ``if a < 0``.\\n' \
        'So, we can write absolute function <<c.fun>>\\n' \
        '```\\n' \
        '    def fun(x):\\n' \
        '        <<=c.isneg,a:v>>:\\n' \
        '            a += 100\\n' \
        '            return -a\\n\\n' \
        '```\\n' \
        'And <<c.sum>>\\n' \
        '```\\n' \
        '    def sum(x, y):\\n\\n' \
        \
        '        return x+y\\n\\n' \
        '    x += 1\\n' \
        '```\\n' \
        'Last chunk is <<c.run>>\\n' \
        '``` $ ls -F1'
        >>> p = Txt2TagsParser()
        >>> toks = p.tokens(text)
        >>> [tok.__class__.__name__ for tok in toks]
        ['CmdToken', 'InlCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken']
        >>> toks[0].text
        'c.isneg'
        >>> toks[1].text
        'if a < 0'
        >>> toks[2].text
        'c.fun'
        >>> toks[3].text.startswith('def fun')
        True
        >>> toks[3].text.endswith('        return -a')
        True
        >>> toks[5].text.startswith('def sum')
        True
        >>> toks[5].text.endswith('x += 1')
        True
        >>> toks[7].text
        '$ ls -F1'
        """
        inl_rx = re.compile(r'``([^\n`]+)``')
        # one-line block: '``` cmd'; fenced block: '```' ... '```'
        oneline_rx = re.compile(r'(?:\n|^)```(?P<code>[^\n]+)(?:\n|$)',
                                re.DOTALL)
        fenced_rx = re.compile(r'(?:\n|^)```\n+(?P<code>.*?)\n```(?:\n|$)',
                               re.DOTALL)

        tokens = []
        for m in core.Cmd.syntax.findtokens('cmddef', text):
            tokens.append(core.CmdToken(m.group(1), m.start(0), m.end(0)))
        for m in inl_rx.finditer(text):
            flat = core.InlCodeToken.linearize(m.group(1))
            tokens.append(core.InlCodeToken(flat, m.start(0), m.end(0)))
        for m in itertools.chain(oneline_rx.finditer(text),
                                 fenced_rx.finditer(text)):
            # drop surrounding blank lines, then the common left indent
            body = core.deltextindent(m.group('code').strip('\n'))
            tokens.append(core.BlkCodeToken(body, m.start(0), m.end(0)))

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens
コード例 #7
0
ファイル: parsers.py プロジェクト: kalkin/nanolp
    def tokens(self, text):
        """Returns tokens

        >>> text = \
        '= Example of Literate Programming in Creole =\\n' \
        \
        '== Code 1 ==\\n' \
        \
        'Test if variable is negative looks like <<c.isneg>>: {{{if a < 0}}}.\\n' \
        'So, we can write absolute function <<c.fun>>:\\n\\n' \
        '{{{\\n' \
        '    def fun(x):\\n' \
        '        <<=c.isneg,a:v>>:\\n' \
        '            a += 100\\n' \
        '            return -a }}}\\n\\n' \
        \
        'And <<c.sum>>:\\n\\n' \
        \
        '{{{    def sum(x, y):\\n' \
        '        return x+y }}}\\n\\n' \
        \
        'not code\\n' \
        'not code\\n\\n' \
         \
        'Lalalalalalal\\n' \
        'Lalalalalalal\\n'
        >>> p = CreoleParser()
        >>> toks = p.tokens(text)
        >>> [tok.__class__.__name__ for tok in toks]
        ['CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'CmdToken', 'BlkCodeToken', 'EndToken']
        >>> toks[0].text
        'c.isneg'
        >>> toks[1].text
        'if a < 0'
        >>> toks[2].text
        'c.fun'
        >>> toks[3].text.startswith('    def fun')
        True
        >>> toks[3].text.endswith('return -a')
        True
        """
        # XXX Only block chunks: in Creole inline and block chunks share the
        # same {{{...}}} markup; real Creole distinguishes them, but that is
        # not valuable for LP.
        blk_rx = re.compile(r'{{{\n*(?P<code>.*?)[\ \n]*}}}',
                            re.DOTALL | re.MULTILINE)

        tokens = []
        for m in core.Cmd.syntax.findtokens('cmddef', text):
            tokens.append(core.CmdToken(m.group(1), m.start(0), m.end(0)))
        for m in blk_rx.finditer(text):
            tokens.append(core.BlkCodeToken(m.group('code'), m.start(0),
                                            m.end(0)))

        tokens.sort(key=lambda tok: tok.start)
        tokens.append(core.EndToken(None))
        return tokens