Beispiel #1
0
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse an HTML document and let ``context`` collect the result.

    - Parse HTML and get miniDOM
    - Extract CSS informations, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    :param src: HTML source. String input is wrapped in a ``pisaTempFile``
        (unicode input is encoded first); anything else is handed to the
        html5lib parser unchanged.
    :param context: object providing ``capacity``, ``addDefaultCSS`` and
        ``parseCSS``; it is also passed to ``pisaPreLoop`` and ``pisaLoop``.
    :param default_css: CSS passed to ``context.addDefaultCSS`` when non-empty.
    :param xhtml: use ``html5lib.XHTMLParser`` instead of ``HTMLParser``.
    :param encoding: encoding used to encode unicode input, passed to the
        parser and used for the XML dump; ``"utf-8"`` is used for unicode
        input when none is given.
    :param xml_output: optional object with a ``write`` method that receives
        the pretty-printed DOM.
    :return: the ``context`` object that was passed in.
    """

    # The attribute cache is a module-level global and is reset on every call.
    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance() instead of an exact type() match, so that subclasses of
    # the string types are treated like plain strings. tuple() keeps this
    # working whatever kind of container StringTypes is.
    if isinstance(src, tuple(StringTypes)):
        if isinstance(src, unicode):
            # If an encoding was provided, do not change it.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib. An invalid encoding is only
    # logged; parsing is attempted regardless.
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
    document = parser.parse(
        src,
        encoding=encoding)

    if xml_output:
        # Fall back to "utf8" for the dump when no encoding is known.
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()

    pisaLoop(document, context)
    return context
Beispiel #2
0
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    Parse an HTML document and let ``context`` collect the result.

    - Parse HTML and get miniDOM
    - Extract CSS informations, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object

    :param src: HTML source. String input is wrapped in a ``pisaTempFile``
        (unicode input is encoded first); anything else is handed to the
        html5lib parser unchanged.
    :param context: object providing ``capacity``, ``addDefaultCSS`` and
        ``parseCSS``; it is also passed to ``pisaPreLoop`` and ``pisaLoop``.
    :param default_css: CSS passed to ``context.addDefaultCSS`` when non-empty.
    :param xhtml: use ``html5lib.XHTMLParser`` instead of ``HTMLParser``.
    :param encoding: encoding used to encode unicode input, passed to the
        parser and used for the XML dump; ``"utf-8"`` is used for unicode
        input when none is given.
    :param xml_output: optional object with a ``write`` method that receives
        the pretty-printed DOM.
    :return: the ``context`` object that was passed in.
    """

    # The attribute cache is a module-level global and is reset on every call.
    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # isinstance() instead of an exact type() match, so that subclasses of
    # the string types are treated like plain strings. tuple() keeps this
    # working whatever kind of container StringTypes is.
    if isinstance(src, tuple(StringTypes)):
        if isinstance(src, unicode):
            # If an encoding was provided, do not change it.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib. An invalid encoding is only
    # logged; parsing is attempted regardless.
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
    document = parser.parse(
        src,
        encoding=encoding)

    if xml_output:
        # Fall back to "utf8" for the dump when no encoding is known.
        xml_output.write(document.toprettyxml(encoding=encoding or "utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()

    pisaLoop(document, context)
    return context
Beispiel #3
0
 def test_unicode(self):
     """Writing raw PDF header bytes to a pisaTempFile must not raise UnicodeDecodeError."""
     pdf_header = b'%PDF-1.4\r\n%\x93\x8c\x8b\x9e ReportLab Generated PDF document http://www.reportlab.com'
     temp_file = pisaTempFile()
     try:
         temp_file.write(pdf_header)
     except UnicodeDecodeError as exc:
         # Report the decoding problem as a test failure rather than an error.
         self.fail(exc)
Beispiel #4
0
 def test_unicode(self):
     """Writing raw PDF header bytes to a pisaTempFile must not raise UnicodeDecodeError."""
     pdf_header = b'%PDF-1.4\r\n%\x93\x8c\x8b\x9e ReportLab Generated PDF document http://www.reportlab.com'
     temp_file = pisaTempFile()
     try:
         temp_file.write(pdf_header)
     except UnicodeDecodeError as exc:
         # Report the decoding problem as a test failure rather than an error.
         self.fail(exc)
Beispiel #5
0
def pisaErrorDocument(dest, c):
    """
    Render the errors and warnings recorded in ``c`` as a document.

    An HTML report is assembled from ``c.err``, ``c.warn`` and the entries of
    ``c.log`` and then converted with ``pisaDocument``.

    :param dest: destination passed on to ``pisaDocument``.
    :param c: object providing ``capacity``, ``err``, ``warn`` and ``log``;
        each ``log`` entry unpacks to ``(mode, line, msg, _)``.
    :return: whatever ``pisaDocument`` returns for the generated report.
    """
    out = pisaTempFile(capacity=c.capacity)

    # Both header paragraphs are closed with </p>; the previous markup ended
    # them with a second opening <p>, which is malformed HTML.
    out.write("<p style='background-color:red;'><strong>%d error(s) occured:</strong></p>" % c.err)
    for mode, line, msg, _ in c.log:
        if mode == "error":
            # The message is escaped because it is embedded in HTML.
            out.write("<pre>%s in line %d: %s</pre>" % (mode, line, cgi.escape(msg)))

    out.write("<p><strong>%d warning(s) occured:</strong></p>" % c.warn)
    for mode, line, msg, _ in c.log:
        if mode == "warning":
            out.write("<p>%s in line %d: %s</p>" % (mode, line, cgi.escape(msg)))

    # raise_exception=False: the report itself should not raise.
    return pisaDocument(out.getvalue(), dest, raise_exception=False)
    def join(self, file=None):
        """
        Concatenate every page of every PDF in ``self.files``.

        When ``file`` is given, the merged PDF is written to it and ``file``
        is returned. Otherwise the merged PDF is written to a
        ``pisaTempFile`` and the result of its ``getvalue()`` is returned.
        """
        writer = PyPDF2.PdfFileWriter()
        for source in self.files:
            reader = PyPDF2.PdfFileReader(source)
            page_total = reader.getNumPages()
            for index in xrange(page_total):
                writer.addPage(reader.getPage(index))

        if file is None:
            # No target supplied: collect the output in a temporary file.
            merged = pisaTempFile(capacity=self.capacity)
            writer.write(merged)
            return merged.getvalue()

        writer.write(file)
        return file
Beispiel #7
0
 def join(self, file=None):
     """
     Concatenate every page of every PDF in ``self.files`` using pyPdf.

     When ``file`` is given, the merged PDF is written to it and ``file``
     is returned. Otherwise the merged PDF is written to a
     ``pisaTempFile`` and the result of its ``getvalue()`` is returned.
     """
     import pyPdf  # TODO: Why is this in the middle of everything?
     if pyPdf:
         writer = pyPdf.PdfFileWriter()
         for source in self.files:
             reader = pyPdf.PdfFileReader(source)
             page_total = reader.getNumPages()
             for index in xrange(page_total):
                 writer.addPage(reader.getPage(index))

     if file is None:
         # No target supplied: collect the output in a temporary file.
         merged = pisaTempFile(capacity=self.capacity)
         writer.write(merged)
         return merged.getvalue()

     writer.write(file)
     return file
Beispiel #8
0
 def join(self, file=None):
     """
     Concatenate every page of every PDF in ``self.files`` using pyPdf.

     When ``file`` is given, the merged PDF is written to it and ``file``
     is returned. Otherwise the merged PDF is written to a
     ``pisaTempFile`` and the result of its ``getvalue()`` is returned.
     """
     import pyPdf  # TODO: Why is this in the middle of everything?
     if pyPdf:
         writer = pyPdf.PdfFileWriter()
         for source in self.files:
             reader = pyPdf.PdfFileReader(source)
             page_total = reader.getNumPages()
             for index in xrange(page_total):
                 writer.addPage(reader.getPage(index))

     if file is None:
         # No target supplied: collect the output in a temporary file.
         merged = pisaTempFile(capacity=self.capacity)
         writer.write(merged)
         return merged.getvalue()

     writer.write(file)
     return file
Beispiel #9
0
def pisaDocument(src,
                 dest=None,
                 path=None,
                 link_callback=None,
                 debug=0,
                 default_css=None,
                 xhtml=False,
                 encoding=None,
                 xml_output=None,
                 raise_exception=True,
                 capacity=100 * 1024,
                 **kw):
    """
    Build a PDF from ``src`` and write it to ``dest``.

    The story is built by ``pisaStory``, rendered through a ``PmlBaseDoc``
    into a temporary buffer, optionally merged with PDF backgrounds via
    pyPdf, and finally written to ``dest``.

    :param src: document source, forwarded to ``pisaStory``.
    :param dest: object with a ``write`` method that receives the PDF data;
        a ``pisaTempFile`` is created when it is ``None``.
    :param path: forwarded to ``pisaContext`` and ``pisaStory``.
    :param link_callback: stored as ``context.pathCallback`` and forwarded
        to ``pisaStory``.
    :param debug: forwarded to ``pisaContext`` and ``pisaStory``.
    :param default_css: forwarded to ``pisaStory``.
    :param xhtml: forwarded to ``pisaStory``.
    :param encoding: forwarded to ``pisaStory``.
    :param xml_output: forwarded to ``pisaStory``.
    :param raise_exception: accepted but not used by this function.
    :param capacity: forwarded to ``pisaContext``.
    :param kw: extra keyword arguments are accepted and ignored.
    :return: the context returned by ``pisaStory``, with ``dest`` set.
    """
    log.debug(
        "pisaDocument options:\n  src = %r\n  dest = %r\n  path = %r\n  link_callback = %r\n  xhtml = %r",
        src, dest, path, link_callback, xhtml)

    # Prepare simple context
    context = pisaContext(path, debug=debug, capacity=capacity)
    context.pathCallback = link_callback

    # Build story
    context = pisaStory(src,
                        path,
                        link_callback,
                        debug,
                        default_css,
                        xhtml,
                        encoding,
                        context=context,
                        xml_output=xml_output)

    # Buffer PDF into memory
    out = pisaTempFile(capacity=context.capacity)

    doc = PmlBaseDoc(out,
                     pagesize=context.pageSize,
                     author=context.meta["author"].strip(),
                     subject=context.meta["subject"].strip(),
                     keywords=[
                         x.strip()
                         for x in context.meta["keywords"].strip().split(",")
                         if x
                     ],
                     title=context.meta["title"].strip(),
                     showBoundary=0,
                     allowSplitting=1)

    # Prepare templates and their frames
    if "body" in context.templateList:
        # Take "body" out of the dict so it is not added a second time below.
        body = context.templateList["body"]
        del context.templateList["body"]
    else:
        x, y, w, h = getBox("1cm 1cm -1cm -1cm", context.pageSize)
        body = PmlPageTemplate(id="body",
                               frames=[
                                   Frame(x,
                                         y,
                                         w,
                                         h,
                                         id="body",
                                         leftPadding=0,
                                         rightPadding=0,
                                         bottomPadding=0,
                                         topPadding=0)
                               ],
                               pagesize=context.pageSize)

    # list(): on Python 3 dict.values() is a view and cannot be added to a list.
    doc.addPageTemplates([body] + list(context.templateList.values()))

    # Use multibuild e.g. if a TOC has to be created
    if context.multiBuild:
        doc.multiBuild(context.story)
    else:
        doc.build(context.story)

    # Add watermarks
    if pyPdf:
        for bgouter in context.pisaBackgroundList:
            # If we have at least one background, then lets do it
            if bgouter:
                istream = out

                output = pyPdf.PdfFileWriter()
                input1 = pyPdf.PdfFileReader(istream)
                ctr = 0
                # TODO: Why do we loop over the same list again?
                # see the outer `bgouter` loop
                for bg in context.pisaBackgroundList:
                    page = input1.getPage(ctr)
                    if (bg and not bg.notFound()
                            and (bg.mimetype == "application/pdf")):
                        # Merge the rendered page onto the first page of the
                        # background PDF and emit the combined page.
                        bginput = pyPdf.PdfFileReader(bg.getFile())
                        pagebg = bginput.getPage(0)
                        pagebg.mergePage(page)
                        page = pagebg
                    else:
                        # Logger.warn is a deprecated alias of warning.
                        log.warning(
                            context.warning("Background PDF %s doesn't exist.",
                                            bg))
                    output.addPage(page)
                    ctr += 1
                # The merged document replaces the original buffer.
                out = pisaTempFile(capacity=context.capacity)
                output.write(out)
                # Found a background? So leave loop after first occurence
                break
    else:
        log.warning(context.warning("pyPDF not installed!"))

    # Get the resulting PDF and write it to the file object
    # passed from the caller

    if dest is None:
        # No output file was passed - Let's use a pisaTempFile
        dest = pisaTempFile(capacity=context.capacity)
    context.dest = dest

    # TODO: That load all the tempfile in RAM - Why bother with a swapping tempfile then?
    data = out.getvalue()
    context.dest.write(data)  # TODO: context.dest is a tempfile as well...

    return context
Beispiel #10
0
def pisaDocument(src, dest=None, path=None, link_callback=None, debug=0,
                 default_css=None, xhtml=False, encoding=None, xml_output=None,
                 raise_exception=True, capacity=100*1024, **kw):
    """
    Build a PDF from ``src`` and write it to ``dest``.

    The story is built by ``pisaStory``, rendered through a ``PmlBaseDoc``
    into a temporary buffer, optionally merged with PDF backgrounds via
    pyPdf, and finally written to ``dest``.

    :param src: document source, forwarded to ``pisaStory``.
    :param dest: object with a ``write`` method that receives the PDF data;
        a ``pisaTempFile`` is created when it is ``None``.
    :param path: forwarded to ``pisaContext`` and ``pisaStory``.
    :param link_callback: stored as ``context.pathCallback`` and forwarded
        to ``pisaStory``.
    :param debug: forwarded to ``pisaContext`` and ``pisaStory``.
    :param default_css: forwarded to ``pisaStory``.
    :param xhtml: forwarded to ``pisaStory``.
    :param encoding: forwarded to ``pisaStory``.
    :param xml_output: forwarded to ``pisaStory``.
    :param raise_exception: accepted but not used by this function.
    :param capacity: forwarded to ``pisaContext``.
    :param kw: extra keyword arguments are accepted and ignored.
    :return: the context returned by ``pisaStory``, with ``dest`` set.
    """

    log.debug("pisaDocument options:\n  src = %r\n  dest = %r\n  path = %r\n  link_callback = %r\n  xhtml = %r",
        src,
        dest,
        path,
        link_callback,
        xhtml)

    # Prepare simple context
    context = pisaContext(path, debug=debug, capacity=capacity)
    context.pathCallback = link_callback

    # Build story
    context = pisaStory(src, path, link_callback, debug, default_css, xhtml,
                        encoding, context=context, xml_output=xml_output)

    # Buffer PDF into memory
    out = pisaTempFile(capacity=context.capacity)

    doc = PmlBaseDoc(
        out,
        pagesize=context.pageSize,
        author=context.meta["author"].strip(),
        subject=context.meta["subject"].strip(),
        keywords=[x.strip() for x in
                  context.meta["keywords"].strip().split(",") if x],
        title=context.meta["title"].strip(),
        showBoundary=0,
        allowSplitting=1)

    # Prepare templates and their frames
    if "body" in context.templateList:
        # Take "body" out of the dict so it is not added a second time below.
        body = context.templateList["body"]
        del context.templateList["body"]
    else:
        x, y, w, h = getBox("1cm 1cm -1cm -1cm", context.pageSize)
        body = PmlPageTemplate(
            id="body",
            frames=[
                Frame(x, y, w, h,
                      id="body",
                      leftPadding=0,
                      rightPadding=0,
                      bottomPadding=0,
                      topPadding=0)],
            pagesize=context.pageSize)

    # list(): on Python 3 dict.values() is a view and cannot be added to a list.
    doc.addPageTemplates([body] + list(context.templateList.values()))

    # Use multibuild e.g. if a TOC has to be created
    if context.multiBuild:
        doc.multiBuild(context.story)
    else:
        doc.build(context.story)

    # Add watermarks
    if pyPdf:
        for bgouter in context.pisaBackgroundList:
            # If we have at least one background, then lets do it
            if bgouter:
                istream = out

                output = pyPdf.PdfFileWriter()
                input1 = pyPdf.PdfFileReader(istream)
                ctr = 0
                # TODO: Why do we loop over the same list again?
                # see the outer `bgouter` loop
                for bg in context.pisaBackgroundList:
                    page = input1.getPage(ctr)
                    if (bg and not bg.notFound()
                        and (bg.mimetype == "application/pdf")):
                        # Merge the rendered page onto the first page of the
                        # background PDF and emit the combined page.
                        bginput = pyPdf.PdfFileReader(bg.getFile())
                        pagebg = bginput.getPage(0)
                        pagebg.mergePage(page)
                        page = pagebg
                    else:
                        # Logger.warn is a deprecated alias of warning.
                        log.warning(context.warning(
                                "Background PDF %s doesn't exist.", bg))
                    output.addPage(page)
                    ctr += 1
                # The merged document replaces the original buffer.
                out = pisaTempFile(capacity=context.capacity)
                output.write(out)
                # Found a background? So leave loop after first occurence
                break
    else:
        log.warning(context.warning("pyPDF not installed!"))

    # Get the resulting PDF and write it to the file object
    # passed from the caller

    if dest is None:
        # No output file was passed - Let's use a pisaTempFile
        dest = pisaTempFile(capacity=context.capacity)
    context.dest = dest

    data = out.getvalue() # TODO: That load all the tempfile in RAM - Why bother with a swapping tempfile then?
    context.dest.write(data) # TODO: context.dest is a tempfile as well...

    return context
Beispiel #11
0
 def addFromString(self, data):
     """Wrap ``data`` in a ``pisaTempFile`` and append it to ``self.files``."""
     wrapped = pisaTempFile(data, capacity=self.capacity)
     self.files.append(wrapped)
 def addFromString(self, data):
     """Wrap ``data`` in a ``pisaTempFile`` and append it to ``self.files``."""
     wrapped = pisaTempFile(data, capacity=self.capacity)
     self.files.append(wrapped)
Beispiel #13
0
def pisaDocument(src,
                 dest=None,
                 path=None,
                 link_callback=None,
                 debug=0,
                 default_css=None,
                 xhtml=False,
                 encoding=None,
                 xml_output=None,
                 raise_exception=True,
                 capacity=100 * 1024,
                 context_meta=None,
                 **kw):
    """
    Build a PDF from ``src`` and write it to ``dest``.

    The story is built by ``pisaStory``, rendered through a ``PmlBaseDoc``
    into an in-memory buffer, optionally merged with PDF backgrounds via
    PyPDF2, and finally written to ``dest``.

    :param src: document source, forwarded to ``pisaStory``.
    :param dest: object with a ``write`` method that receives the PDF data;
        an ``io.BytesIO`` is created when it is ``None``.
    :param path: forwarded to ``pisaContext`` and ``pisaStory``.
    :param link_callback: stored as ``context.pathCallback`` and forwarded
        to ``pisaStory``.
    :param debug: forwarded to ``pisaContext`` and ``pisaStory``.
    :param default_css: forwarded to ``pisaStory``.
    :param xhtml: forwarded to ``pisaStory``.
    :param encoding: forwarded to ``pisaStory``.
    :param xml_output: forwarded to ``pisaStory``.
    :param raise_exception: accepted but not used by this function.
    :param capacity: forwarded to ``pisaContext``.
    :param context_meta: optional mapping merged into ``context.meta``
        before the story is built.
    :param kw: extra keyword arguments are accepted and ignored.
    :return: the context returned by ``pisaStory``, with ``dest`` set.
    """
    log.debug(
        "pisaDocument options:\n  src = %r\n  dest = %r\n  path = %r\n  link_callback = %r\n  xhtml = %r\n  context_meta = %r",
        src, dest, path, link_callback, xhtml, context_meta)

    # Prepare simple context
    context = pisaContext(path, debug=debug, capacity=capacity)

    if context_meta is not None:
        context.meta.update(context_meta)

    context.pathCallback = link_callback

    # Build story
    context = pisaStory(src,
                        path,
                        link_callback,
                        debug,
                        default_css,
                        xhtml,
                        encoding,
                        context=context,
                        xml_output=xml_output)

    # Buffer PDF into memory
    out = io.BytesIO()
    doc = PmlBaseDoc(out,
                     pagesize=context.pageSize,
                     author=context.meta["author"].strip(),
                     subject=context.meta["subject"].strip(),
                     keywords=[
                         x.strip()
                         for x in context.meta["keywords"].strip().split(",")
                         if x
                     ],
                     title=context.meta["title"].strip(),
                     showBoundary=0,
                     allowSplitting=1)

    # Prepare templates and their frames
    if "body" in context.templateList:
        # Take "body" out of the dict so it is not added a second time below.
        body = context.templateList["body"]
        del context.templateList["body"]
    else:
        x, y, w, h = getBox("1cm 1cm -1cm -1cm", context.pageSize)
        body = PmlPageTemplate(id="body",
                               frames=[
                                   Frame(x,
                                         y,
                                         w,
                                         h,
                                         id="body",
                                         leftPadding=0,
                                         rightPadding=0,
                                         bottomPadding=0,
                                         topPadding=0)
                               ],
                               pagesize=context.pageSize)

    ptl = build_grid_templates(doc, context)
    if ptl == []:
        # No grid templates: use the body template plus the declared ones.
        doc.addPageTemplates([body] + list(context.templateList.values()))
    else:
        # NOTE(review): `out_grid` is not defined inside this function;
        # presumably a module-level name - confirm it exists.
        if out_grid == []:
            doc.addPageTemplates(ptl)
        else:
            doc.addPageTemplates([body] + ptl)

    # Use multibuild e.g. if a TOC has to be created
    if context.multiBuild:
        doc.multiBuild(context.story)
    else:
        doc.build(context.story)

    # Add watermarks
    if PyPDF2:
        for bgouter in context.pisaBackgroundList:
            # If we have at least one background, then lets do it
            if bgouter:
                output = PyPDF2.PdfFileWriter()
                input1 = PyPDF2.PdfFileReader(out)
                # Every background file opened below is tracked here so that
                # all of them get closed, not only the most recent one.
                bg_handles = []
                try:
                    # TODO: Why do we loop over the same list again?
                    # see the outer `bgouter` loop
                    for ctr, bg in enumerate(context.pisaBackgroundList):
                        page = input1.getPage(ctr)
                        if (bg and not bg.notFound()
                                and (bg.mimetype == "application/pdf")):
                            file_handler = open(bg.uri, 'rb')
                            bg_handles.append(file_handler)
                            bginput = PyPDF2.PdfFileReader(file_handler)
                            pagebg = bginput.getPage(0)
                            pagebg.mergePage(page)
                            page = pagebg

                        # Todo: an else-branch warning about a missing
                        #  background PDF was disabled here because it only
                        #  produced noise in the unit tests. Probably the
                        #  whole "background-image" handling has to be
                        #  rewritten to deal with mixed cases (.jpg / .pdf /
                        #  no background on different pages). (fbernhart)

                        output.addPage(page)

                    # The merged document replaces the original buffer.
                    out = pisaTempFile(capacity=context.capacity)
                    output.write(out)
                finally:
                    # The handles stay open until the merged PDF has been
                    # written and are released even if merging fails.
                    for file_handler in bg_handles:
                        file_handler.close()
                # Found a background? So leave loop after first occurence
                break
    else:
        log.warning(context.warning("PyPDF2 not installed!"))

    # Get the resulting PDF and write it to the file object
    # passed from the caller

    if dest is None:
        # No output file was passed - Let's use an in-memory buffer
        dest = io.BytesIO()
    context.dest = dest

    data = out.getvalue()
    context.dest.write(data)  # TODO: context.dest is a tempfile as well...

    return context
Beispiel #14
0
def pisaParser(src,
               context,
               default_css="",
               xhtml=False,
               encoding=None,
               xml_output=None):
    """
    Turn HTML source into a reportlab story held by ``context``.

    Steps, in order:

    - parse the HTML into a miniDOM,
    - optionally dump the DOM as pretty-printed XML to ``xml_output``,
    - add the default CSS, collect the document's CSS and parse it,
    - walk the DOM to build the story,
    - return the context object.
    """

    # The attribute cache is a module-level global and is reset on every call.
    global CSSAttrCache
    CSSAttrCache = {}

    # TODO: XHTMLParser doesn't seem to exist...
    parser_class = html5lib.XHTMLParser if xhtml else html5lib.HTMLParser
    parser = parser_class(tree=treebuilders.getTreeBuilder("dom"))

    extra_parse_args = {}
    if isinstance(src, six.text_type):
        # Keep a caller-supplied encoding; only fall back when none was given.
        encoding = encoding or "utf-8"
        encoded = src.encode(encoding)
        src = pisaTempFile(encoded, capacity=context.capacity)
        # Tell html5lib which encoding was used to turn the text into bytes,
        # so that it decodes them the same way.
        extra_parse_args['transport_encoding'] = encoding

    document = parser.parse(src, **extra_parse_args)

    if xml_output:
        dump_encoding = encoding if encoding else "utf8"
        xml_output.write(document.toprettyxml(encoding=dump_encoding))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    context.parseCSS()
    pisaLoop(document, context)

    return context
Beispiel #15
0
def pisaDocument(
    src,
    dest = None,
    path = None,
    link_callback = None,
    debug = 0,
    show_error_as_pdf = False,
    default_css = None,
    xhtml = False,
    encoding = None,
    xml_output = None,
    raise_exception = True,
    capacity = 100 * 1024, # -1,
    **kw):
    """
    Build a PDF from ``src``, write it to ``dest`` and close ``dest``.

    :param src: document source, forwarded to ``pisaStory``.
    :param dest: object receiving the PDF data; a ``pisaTempFile`` is
        created when it is ``None``. It is closed after writing.
    :param path: forwarded to ``pisaContext`` and ``pisaStory``.
    :param link_callback: stored as ``c.pathCallback`` and forwarded to
        ``pisaStory``.
    :param debug: forwarded to ``pisaContext`` and ``pisaStory``.
    :param show_error_as_pdf: when true, ``raise_exception`` is forced to
        ``False`` and, if errors were counted, the result of
        ``pisaErrorDocument`` is returned instead.
    :param default_css: forwarded to ``pisaStory``.
    :param xhtml: forwarded to ``pisaStory``.
    :param encoding: forwarded to ``pisaStory``.
    :param xml_output: forwarded to ``pisaStory``.
    :param raise_exception: re-raise exceptions, and raise when ``c.err`` is
        non-zero at the end.
    :param capacity: forwarded to ``pisaContext``.
    :param kw: extra keyword arguments are accepted and ignored.
    :return: the context object ``c``.
    :raises Exception: when ``raise_exception`` is true and errors occurred.
    """

    c = None
    if show_error_as_pdf:
        raise_exception = False

    try:

        log.debug("pisaDocument options:\n  src = %r\n  dest = %r\n  path = %r\n  link_callback = %r\n  xhtml = %r",
            src,
            dest,
            path,
            link_callback,
            xhtml)

        # Prepare simple context
        c = pisaContext(path, debug=debug, capacity=capacity)
        c.pathCallback = link_callback

        if dest is None:
            dest = pisaTempFile(capacity=c.capacity)
        c.dest = dest

        # Build story
        c = pisaStory(src, path, link_callback, debug, default_css, xhtml, encoding, c=c, xml_output=xml_output)

        # Buffer PDF into memory
        out = pisaTempFile(capacity=c.capacity)

        doc = PmlBaseDoc(
            out,
            pagesize = c.pageSize,
            author = c.meta["author"].strip(),
            subject = c.meta["subject"].strip(),
            keywords = [x.strip() for x in c.meta["keywords"].strip().split(",") if x],
            title = c.meta["title"].strip(),
            showBoundary = 0,
            allowSplitting = 1)

        # XXX It is not possible to access PDF info, because it is private in canvas
        # doc.info.producer = "pisa <http://www.holtwick.it>"

        # Prepare templates and their frames
        if "body" in c.templateList:
            # Take "body" out of the dict so it is not added a second time below.
            body = c.templateList["body"]
            del c.templateList["body"]
        else:
            x, y, w, h = getBox("1cm 1cm -1cm -1cm", c.pageSize)
            body = PmlPageTemplate(
                id="body",
                frames=[
                    Frame(x, y, w, h,
                        id = "body",
                        leftPadding = 0,
                        rightPadding = 0,
                        bottomPadding = 0,
                        topPadding = 0)],
                pagesize = c.pageSize)

        # list(): on Python 3 dict.values() is a view and cannot be added to a list.
        doc.addPageTemplates([body] + list(c.templateList.values()))

        # Count pages through the document's page callback; the counter
        # lives on the doc object so the nested function can update it.
        doc._pisa_page_counter = 0
        def _page_counter(page_no):
            doc._pisa_page_counter += 1
        doc.setPageCallBack(_page_counter)

        # Use multibuild e.g. if a TOC has to be created
        if c.multiBuild:
            doc.multiBuild(c.story)
        else:
            doc.build(c.story)

        c._pisa_page_counter = doc._pisa_page_counter

        # Add watermarks
        if pyPdf:
            for bgouter in c.pisaBackgroundList:

                # If we have at least one background, then lets do it
                if bgouter:

                    istream = out
                    try:
                        output = pyPdf.PdfFileWriter()
                        input1 = pyPdf.PdfFileReader(istream)
                        ctr = 0
                        for bg in c.pisaBackgroundList:
                            page = input1.getPage(ctr)
                            if bg and not bg.notFound() and (bg.mimetype=="application/pdf"):
                                # Merge the rendered page onto the first page
                                # of the background PDF.
                                bginput = pyPdf.PdfFileReader(bg.getFile())
                                pagebg = bginput.getPage(0)
                                pagebg.mergePage(page)
                                page = pagebg
                            else:
                                # Logger.warn is a deprecated alias of warning.
                                log.warning(c.warning("Background PDF %s doesn't exist.", bg))
                            output.addPage(page)
                            ctr += 1
                        # The merged document replaces the original buffer.
                        out = pisaTempFile(capacity=c.capacity)
                        output.write(out)
                    except Exception:
                        log.exception(c.error("pyPDF error"))
                        if raise_exception:
                            raise

                    # Found a background? So leave loop after first occurence
                    break
        else:
            log.warning(c.warning("pyPDF not installed!"))

        # In web frameworks for debugging purposes maybe an output of
        # errors in a PDF is preferred
        if show_error_as_pdf and c and c.err:
            return pisaErrorDocument(c.dest, c)

        # Get the resulting PDF and write it to the file object
        # passed from the caller
        data = out.getvalue()
        c.dest.write(data)
        c.dest.close()

    except Exception: # TODO: Kill catch-all!
        # Narrowed from a bare except so KeyboardInterrupt / SystemExit are
        # no longer swallowed when raise_exception is false.
        log.exception("Document error")
        if c is None:
            # The context could not even be created, so there is nothing to
            # count the error on; surface the real cause instead of failing
            # with an AttributeError on None.
            raise
        c.err += 1
        if raise_exception:
            raise

    if raise_exception and c.err:
        raise Exception("Errors occured, please see log files for more informations")

    return c