Ejemplo n.º 1
0
    def tokens(self, text):
        """Tokenize an XHTML document into command and block-code tokens.

        `text` is handed to the SAX parser together with an
        ``XHTMLSaxHandler`` (built from ``self.styles`` and ``self.tags``)
        and an ``XHTMLSaxErrorHandler`` (built from ``self`` and
        ``self.ignore``).  The fragments collected in the handler's ``text``
        attribute are joined into one string; match offsets stored in the
        tokens are taken from that joined string.

        Returns a list with one ``core.CmdToken`` per 'cmddef' match and one
        ``core.BlkCodeToken`` per ``@start_block@...@end_block@`` region,
        sorted by the tokens' ``start`` attribute and terminated by
        ``core.EndToken(None)``.
        """
        handler = XHTMLSaxHandler(styles=self.styles, tags=self.tags)
        errhandler = XHTMLSaxErrorHandler(self, self.ignore)

        # Exact ``str`` input goes through core.strtobytes23 first (the
        # original author noted the parser wanted bytes rather than str);
        # any other type is passed to the parser untouched.
        source = core.strtobytes23(text) if type(text) is str else text
        sax.parseString(source, handler, errhandler)
        flat = ''.join(handler.text)

        # DOTALL lets a block body span several lines.
        block_pattern = re.compile(r'@start_block@(.*?)@end_block@',
                                   re.DOTALL | re.MULTILINE)

        cmd_hits = core.Cmd.syntax.findtokens('cmddef', flat)
        block_hits = block_pattern.finditer(flat)

        # NOTE: inline-code extraction (@start_inline@ ... @end_inline@) is
        # disabled in this handler.  The former implementation was commented
        # out with the remark that a line break in HTML is coded with
        # paragraph styling, so linearizing the text did not help.

        found = [
            core.CmdToken(
                XHTMLSaxHandler._decode_spec_chars(hit.group(1)),
                hit.start(0), hit.end(0))
            for hit in cmd_hits
        ]

        for hit in block_hits:
            # Drop leading newlines and trailing newlines/spaces, then let
            # core.deltextindent process the body (presumably removing the
            # common indentation -- confirm against core).
            body = hit.group(1).lstrip('\n').rstrip('\n ')
            body = core.deltextindent(body)
            found.append(core.BlkCodeToken(
                XHTMLSaxHandler._decode_spec_chars(body),
                hit.start(0), hit.end(0)))

        # list.sort is stable: tokens sharing a start keep command-first order.
        found.sort(key=lambda item: item.start)
        found.append(core.EndToken(None))
        return found
Ejemplo n.º 2
0
    def tokens(self, text):
        """Tokenize an OpenOffice document into command and code tokens.

        `text` is handed to the SAX parser together with an ``OOSaxHandler``
        built from ``self.style``.  The fragments collected in the handler's
        ``text`` attribute are joined into one string; match offsets stored
        in the tokens are taken from that joined string.

        Returns a list with one ``core.CmdToken`` per 'cmddef' match, one
        ``core.InlCodeToken`` per ``@start_inline@...@end_inline@`` region
        and one ``core.BlkCodeToken`` per ``@start_block@...@end_block@``
        region, sorted by the tokens' ``start`` attribute and terminated by
        ``core.EndToken(None)``.
        """
        handler = OOSaxHandler(style=self.style)

        # Exact ``str`` input goes through core.strtobytes23 first (the
        # original author noted the parser wanted bytes rather than str);
        # any other type is passed to the parser untouched.
        source = core.strtobytes23(text) if type(text) is str else text
        sax.parseString(source, handler)
        flat = ''.join(handler.text)

        # NOTE(review): neither pattern uses re.DOTALL, so '.' never matches
        # a newline and a region whose body contains one is not found.
        # Confirm this is intended for block code.
        inline_pattern = re.compile(r'@start_inline@(.*?)@end_inline@',
                                    re.MULTILINE)
        block_pattern = re.compile(r'@start_block@(.*?)@end_block@',
                                   re.MULTILINE)

        cmd_hits = core.Cmd.syntax.findtokens('cmddef', flat)
        inline_hits = inline_pattern.finditer(flat)
        block_hits = block_pattern.finditer(flat)

        found = [
            core.CmdToken(OOSaxHandler._decode_spec_chars(hit.group(1)),
                          hit.start(0), hit.end(0))
            for hit in cmd_hits
        ]

        for hit in inline_hits:
            # XXX (kept from the original): a line break in OO is coded with
            # paragraph styling, so this linearizing does not help.
            snippet = core.InlCodeToken.linearize(
                OOSaxHandler._decode_spec_chars(hit.group(1)))
            found.append(core.InlCodeToken(snippet, hit.start(0), hit.end(0)))

        for hit in block_hits:
            found.append(core.BlkCodeToken(
                OOSaxHandler._decode_spec_chars(hit.group(1)),
                hit.start(0), hit.end(0)))

        # list.sort is stable: tokens sharing a start keep the
        # command / inline / block insertion order.
        found.sort(key=lambda item: item.start)
        found.append(core.EndToken(None))
        return found
Ejemplo n.º 3
0
    def tokens(self, text):
        """Return the command and block-code tokens found in XHTML `text`.

        The document is SAX-parsed with an ``XHTMLSaxHandler`` (configured
        from ``self.styles`` / ``self.tags``) and an ``XHTMLSaxErrorHandler``
        (configured from ``self`` / ``self.ignore``).  The handler's ``text``
        pieces are concatenated, and that concatenation is what gets scanned
        for 'cmddef' matches and ``@start_block@...@end_block@`` regions.

        The result is a list of ``core.CmdToken`` and ``core.BlkCodeToken``
        objects ordered by their ``start`` attribute, followed by a final
        ``core.EndToken(None)``.
        """
        sax_handler = XHTMLSaxHandler(styles=self.styles, tags=self.tags)
        sax_errors = XHTMLSaxErrorHandler(self, self.ignore)

        # Only an exact ``str`` is routed through core.strtobytes23 (the
        # original comment says the parser needed bytes, not str).
        if type(text) is str:
            payload = core.strtobytes23(text)
        else:
            payload = text
        sax.parseString(payload, sax_handler, sax_errors)
        document = ''.join(sax_handler.text)

        # DOTALL allows the body of a block to contain newlines.
        blk_regex = re.compile(r'@start_block@(.*?)@end_block@',
                               re.DOTALL | re.MULTILINE)

        cmd_matches = core.Cmd.syntax.findtokens('cmddef', document)
        blk_matches = blk_regex.finditer(document)

        # NOTE: inline code (@start_inline@ ... @end_inline@) is not
        # tokenized here; that path was commented out in the original with
        # the remark that HTML line breaks are coded with paragraph styling,
        # so linearizing did not help.

        cmd_tokens = [
            core.CmdToken(
                XHTMLSaxHandler._decode_spec_chars(match.group(1)),
                match.start(0),
                match.end(0))
            for match in cmd_matches
        ]

        # Each block body has leading newlines and trailing newlines/spaces
        # stripped and is passed through core.deltextindent before decoding.
        blk_tokens = [
            core.BlkCodeToken(
                XHTMLSaxHandler._decode_spec_chars(
                    core.deltextindent(
                        match.group(1).lstrip('\n').rstrip('\n '))),
                match.start(0),
                match.end(0))
            for match in blk_matches
        ]

        # sorted() is stable, so on equal starts commands precede blocks.
        ordered = sorted(cmd_tokens + blk_tokens, key=lambda t: t.start)
        ordered.append(core.EndToken(None))
        return ordered
Ejemplo n.º 4
0
    def tokens(self, text):
        """Return the command, inline-code and block-code tokens of `text`.

        The document is SAX-parsed with an ``OOSaxHandler`` configured from
        ``self.style``.  The handler's ``text`` pieces are concatenated, and
        that concatenation is scanned for 'cmddef' matches,
        ``@start_inline@...@end_inline@`` regions and
        ``@start_block@...@end_block@`` regions.

        The result is a list of ``core.CmdToken``, ``core.InlCodeToken`` and
        ``core.BlkCodeToken`` objects ordered by their ``start`` attribute,
        followed by a final ``core.EndToken(None)``.
        """
        sax_handler = OOSaxHandler(style=self.style)

        # Only an exact ``str`` is routed through core.strtobytes23 (the
        # original comment says the parser needed bytes, not str).
        if type(text) is str:
            payload = core.strtobytes23(text)
        else:
            payload = text
        sax.parseString(payload, sax_handler)
        document = ''.join(sax_handler.text)

        # NOTE(review): without re.DOTALL the '.' in these patterns stops at
        # a newline, so a region spanning several lines is never matched.
        # Verify that this is the intended behaviour for block code.
        inl_regex = re.compile(r'@start_inline@(.*?)@end_inline@',
                               re.MULTILINE)
        blk_regex = re.compile(r'@start_block@(.*?)@end_block@',
                               re.MULTILINE)

        cmd_matches = core.Cmd.syntax.findtokens('cmddef', document)
        inl_matches = inl_regex.finditer(document)
        blk_matches = blk_regex.finditer(document)

        cmd_tokens = [
            core.CmdToken(
                OOSaxHandler._decode_spec_chars(match.group(1)),
                match.start(0),
                match.end(0))
            for match in cmd_matches
        ]

        # XXX (kept from the original): a line break in OO is coded with
        # paragraph styling, so the linearizing below does not help.
        inl_tokens = [
            core.InlCodeToken(
                core.InlCodeToken.linearize(
                    OOSaxHandler._decode_spec_chars(match.group(1))),
                match.start(0),
                match.end(0))
            for match in inl_matches
        ]

        blk_tokens = [
            core.BlkCodeToken(
                OOSaxHandler._decode_spec_chars(match.group(1)),
                match.start(0),
                match.end(0))
            for match in blk_matches
        ]

        # sorted() is stable, so on equal starts the command / inline /
        # block order is preserved.
        ordered = sorted(cmd_tokens + inl_tokens + blk_tokens,
                         key=lambda t: t.start)
        ordered.append(core.EndToken(None))
        return ordered