Exemple #1
0
def write_fillable_pdf(file, output_pdf_path, data_dict):
    """Fill the first page of a PDF form template and save the result.

    Args:
        file: Template path, joined onto ``BASE_DIR``.
        output_pdf_path: Not used by this function; kept so existing callers
            keep working. The result is always written under
            ``BASE_DIR/tmp``.
        data_dict: Mapping of form field name (the annotation's ``/T``) to
            the value to write into that field.

    Returns:
        The path of the PDF that was written.
    """
    # NOTE(review): randrange(20, 200, 3) yields only 60 distinct names, so
    # concurrent or repeated calls can overwrite each other's output.
    out_path = os.path.join(
        BASE_DIR, "tmp/" + str(random.randrange(20, 200, 3)) + ".pdf")
    template_path = os.path.join(BASE_DIR, file)

    # The reader resolves page objects lazily from its stream, so the
    # template must stay open until the writer has produced the output.
    with open(template_path, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            # Ask viewers to regenerate field appearances so the filled-in
            # values are displayed and printed.
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data_dict)

        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in data_dict):
                # Field flags value 1 marks the filled field read-only.
                writer_annot.update({NameObject("/Ff"): NumberObject(1)})

        # Write straight to the target file. The previous version wrote to a
        # BytesIO and then called read() without rewinding, which always
        # produced an empty file.
        with open(out_path, 'wb') as out_file:
            pdf_writer.write(out_file)

    return out_path
def create_annot_box(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    """Build a ``/Square`` annotation dictionary.

    Args:
        x1, y1, x2, y2: The four ``/Rect`` entries, in this order.
        meta: Mapping with the keys ``"author"`` (stored as ``/T``) and
            ``"contents"`` (stored as ``/Contents``).
        color: Components stored in ``/C``. The default list is only read,
            never mutated.

    Returns:
        DictionaryObject: the annotation; no parent page (``/P``) is set.
    """
    box = DictionaryObject()

    # Annotation flags value 4 is the "Print" flag.
    box[NameObject("/F")] = NumberObject(4)
    box[NameObject("/Type")] = NameObject("/Annot")
    box[NameObject("/Subtype")] = NameObject("/Square")
    box[NameObject("/T")] = TextStringObject(meta["author"])
    box[NameObject("/Contents")] = TextStringObject(meta["contents"])
    box[NameObject("/C")] = ArrayObject(
        [FloatObject(component) for component in color])
    box[NameObject("/Rect")] = ArrayObject(
        [FloatObject(coord) for coord in (x1, y1, x2, y2)])

    return box
Exemple #3
0
def generatePdf(infile, outfile):
    """Copy ``infile`` to ``outfile`` with form fields filled in.

    Every page is copied and filled from the module-level ``FORM_VALUES``
    mapping. On the first page, fields whose name appears in
    ``FLATTEN_VALUES`` are additionally marked read-only.

    Args:
        infile: Path of the source PDF form.
        outfile: Path the filled PDF is written to.
    """
    # The reader resolves objects lazily, so the input must stay open until
    # the writer has finished writing.
    with open(infile, "rb") as in_stream:
        pdf = PdfFileReader(in_stream, strict=False)
        if "/AcroForm" in pdf.trailer["/Root"]:
            pdf.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf2 = PdfFileWriter()
        updateFormProperlyWriter(pdf2)
        if "/AcroForm" in pdf2._root_object:
            pdf2._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # Add pages. The previous range(getNumPages() - 1) dropped the last
        # page and left single-page documents with no pages at all.
        for x in range(pdf.getNumPages()):
            pdf2.addPage(pdf.getPage(x))
            page = pdf2.getPage(x)
            # Pages without annotations carry no form fields to fill.
            if "/Annots" in page:
                pdf2.updatePageFormFieldValues(page, FORM_VALUES)

        # Flatten form fields (first page only).
        flat_page = pdf2.getPage(0)
        if "/Annots" in flat_page:
            for annot_ref in flat_page['/Annots']:
                writer_annot = annot_ref.getObject()
                field_name = writer_annot.get('/T')
                if any(field_name == field for field in FLATTEN_VALUES):
                    writer_annot.update({
                        NameObject("/Ff"): NumberObject(1)  # make ReadOnly
                    })

        with open(outfile, "wb") as out_stream:
            pdf2.write(out_stream)
def createHighlight(x0, y0, x1, y1, color=[0, 0, 0]):
    """Build a ``/Highlight`` annotation dictionary for a rectangle.

    Args:
        x0, y0, x1, y1: The four ``/Rect`` entries, in this order.
        color: Components stored in ``/C``. The default list is only read,
            never mutated.

    Returns:
        DictionaryObject: the annotation. ``/QuadPoints`` lists the corners
        as (x0, y1), (x1, y1), (x0, y0), (x1, y0).
    """
    def as_float_array(values):
        # Wrap plain numbers into a PDF array of FloatObject entries.
        return ArrayObject([FloatObject(value) for value in values])

    annotation = DictionaryObject()
    annotation.update({
        # Annotation flags value 4 is the "Print" flag.
        NameObject("/F"): NumberObject(4),
        NameObject("/Type"): NameObject("/Annot"),
        NameObject("/Subtype"): NameObject("/Highlight"),
        NameObject("/C"): as_float_array(color),
        NameObject("/Rect"): as_float_array((x0, y0, x1, y1)),
        NameObject("/QuadPoints"): as_float_array(
            (x0, y1, x1, y1, x0, y0, x1, y0)),
    })
    return annotation
Exemple #5
0
    def fill(self, input_dict: Dict[str, Any]) -> None:
        """Fill the input PDF's form fields and append every page to ``self.pdf``.

        Pages of ``self.input_pdf`` are paired positionally with the entries
        of ``self.mapping``; each entry maps a field name (the annotation's
        ``/T``) to a key of ``input_dict``. Every annotation on a page is
        marked read-only, whether or not it is mapped.

        Args:
            input_dict: Values to write, looked up through the page mapping.
                ``bool`` values set or clear ``/V`` and ``/AS``; any other
                value is written as its ``str()`` form.
        """
        for page, field_map in zip(self.input_pdf.pages, self.mapping):
            # A page without annotations has nothing to fill; it is still
            # appended below.
            annotations = page["/Annots"] if "/Annots" in page else ()

            for annot_ref in annotations:
                annot = annot_ref.getObject()
                annot.update({NameObject("/Ff"): NumberObject(1)})  # make ReadOnly

                for field_name, input_key in field_map.items():
                    if annot.get("/T") != field_name:
                        continue

                    input_value: Any = input_dict[input_key]
                    if not isinstance(input_value, bool):
                        text: str = str(input_value)
                        annot.update(
                            {NameObject("/V"): TextStringObject(text), NameObject("/AP"): TextStringObject(text)}
                        )
                    elif input_value:
                        # True: value and appearance state both become "/1".
                        annot.update(
                            {NameObject("/V"): NameObject("/1"), NameObject("/AS"): NameObject("/1")}
                        )
                    else:
                        # False: drop any stored value and show the "/Off" state.
                        if "/V" in annot:
                            del annot["/V"]
                        annot.update({NameObject("/AS"): NameObject("/Off")})

            self.pdf.addPage(page)
Exemple #6
0
 def __init__(self, im):
     """Build an image XObject stream from ``im``.

     ``im`` must expose ``mode``, ``size`` and ``tobytes()``. The mode is
     looked up in ``MODE_TO_COLORSPACE`` to obtain the bit depth and the
     colour space name.

     Raises:
         NotImplementedError: if ``im.mode`` has no entry in
             ``MODE_TO_COLORSPACE``.
     """
     super().__init__()
     try:
         depth, colorspace = MODE_TO_COLORSPACE[im.mode]
     except KeyError:
         raise NotImplementedError('image mode %r not supported' % im.mode)
     width, height = im.size

     # The raw pixel data is always stored Flate-compressed.
     self._data = FlateDecode.encode(im.tobytes())

     stream_entries = (
         ("/Filter", NameObject('/FlateDecode')),
         ('/Type', NameObject('/XObject')),
         ('/Subtype', NameObject('/Image')),
         ('/Width', NumberObject(width)),
         ('/Height', NumberObject(height)),
         ('/BitsPerComponent', NumberObject(depth)),
         ('/ColorSpace', NameObject(colorspace)),
     )
     for entry_name, entry_value in stream_entries:
         self[NameObject(entry_name)] = entry_value
Exemple #7
0
    def createHighlight(self,x1, y1, x2, y2, meta, color = [1, 0, 0]):
        """Build a ``/Highlight`` annotation dictionary with author and comment.

        Args:
            x1, y1, x2, y2: The four ``/Rect`` entries, in this order.
            meta: Mapping with the keys ``"author"`` (stored as ``/T``) and
                ``"contents"`` (stored as ``/Contents``).
            color: Components stored in ``/C``. The default list is only
                read, never mutated.

        Returns:
            DictionaryObject: the annotation. ``/QuadPoints`` lists the
            corners as (x1, y2), (x2, y2), (x1, y1), (x2, y1).
        """
        annotation = DictionaryObject()

        entries = [
            # Annotation flags value 4 is the "Print" flag.
            (NameObject("/F"), NumberObject(4)),
            (NameObject("/Type"), NameObject("/Annot")),
            (NameObject("/Subtype"), NameObject("/Highlight")),
            (NameObject("/T"), TextStringObject(meta["author"])),
            (NameObject("/Contents"), TextStringObject(meta["contents"])),
            (NameObject("/C"),
             ArrayObject([FloatObject(component) for component in color])),
            (NameObject("/Rect"),
             ArrayObject([FloatObject(v) for v in (x1, y1, x2, y2)])),
            (NameObject("/QuadPoints"),
             ArrayObject([FloatObject(v)
                          for v in (x1, y2, x2, y2, x1, y1, x2, y1)])),
        ]
        annotation.update(dict(entries))

        return annotation
Exemple #8
0
def test_CCITTFaxDecode():
    """Pin the bytes ``CCITTFaxDecode.decode`` returns for empty input.

    The decode parameters are ``/K`` = -1 and ``/Columns`` = 17.
    """
    decode_parms = DictionaryObject({
        "/K": NumberObject(-1),
        "/Columns": NumberObject(17)
    })

    # This is simply what PyPDF2 1.27.9 returned; it has not been checked
    # independently against the CCITT specification.
    expected = (
        b"II*\x00\x08\x00\x00\x00\x08\x00\x00\x01\x04\x00\x01\x00\x00\x00\x11\x00"
        b"\x00\x00\x01\x01\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x02\x01"
        b"\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x03\x01\x03\x00\x01\x00"
        b"\x00\x00\x04\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x00\x00"
        b"\x00\x00\x11\x01\x04\x00\x01\x00\x00\x00l\x00\x00\x00\x16\x01"
        b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
        b"\x00\x00\x00\x00\x00\x00\x00\x00")

    actual = CCITTFaxDecode.decode(b"", decode_parms)
    assert actual == expected
def pdf(request, template):
    """Fill the first page of a PDF form with sample data and return it.

    Args:
        request: The incoming request; it is not read by this view.
        template: Path of the PDF form template to fill.

    Returns:
        FileResponse: the filled PDF, sent as an attachment named
        ``test.pdf``.
    """
    data_dict = {
        'numero_de_contrat#0': '12345\n',
        'numero_de_contrat#1': '12345\n',
        'Raison_Sociale': 'Dunder Mifflen\n',
        'Adresse': '1 paper drive, Paris\n',
        'SIREN': '1 paper drive, Paris\n',
        'Tel': '06.95.97.02.30\n',
    }

    # The reader resolves page objects lazily from its stream, so the
    # template stays open until the writer has serialized the document.
    with open(template, "rb") as input_stream:
        # strict=False tolerates correctable problems in the template.
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)

        # NeedAppearances asks the viewer to regenerate the appearance of
        # form fields so the newly set values are displayed.
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # Copy the first template page into the output and fill its fields.
        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data_dict)

        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in data_dict):
                # Field flags value 1 marks the filled field read-only.
                writer_annot.update({NameObject("/Ff"): NumberObject(1)})

        output_stream = BytesIO()
        pdf_writer.write(output_stream)

    # FileResponse streams from the current position. Without rewinding, the
    # buffer sits at end-of-stream and the client receives an empty body.
    output_stream.seek(0)

    return FileResponse(output_stream, as_attachment=True, filename='test.pdf')
Exemple #10
0
def createHighlight(bbox=(0, 0, 1, 1),
                    contents="",
                    color=[1, 1, 0],
                    author="iwasakishuto(@cabernet_rock)"):
    """Create a highlight annotation dictionary.

    Args:
        bbox (tuple)   : Four numbers ``(x1, y1, x2, y2)`` stored as ``/Rect``.
        contents (str) : Comment text stored as ``/Contents``.
        color (list)   : Highlight color stored as ``/C``. Defaults to ``[1,1,0]``. (yellow)
        author (str)   : Annotation author stored as ``/T``. Defaults to ``"iwasakishuto(@cabernet_rock)"`` .

    Returns:
        DictionaryObject: The highlight annotation. ``/QuadPoints`` lists the
        corners as (x1, y2), (x2, y2), (x1, y1), (x2, y1).

    Examples:
        >>> from gummy.utils import createHighlight, addHighlightToPage
        >>> from PyPDF2 import PdfFileWriter, PdfFileReader
        >>> page_no = 0
        >>> pdfOutput = PdfFileWriter()
        >>> with open("input.pdf", mode="rb") as inPdf:
        ...     pdfInput = PdfFileReader(inPdf)
        ...     page = pdfInput.getPage(page_no)
        ...     highlight = createHighlight(bbox=(10,10,90,90), contents="COMMENT", color=(1,1,0))
        ...     addHighlightToPage(highlight, page, pdfOutput)
        ...     pdfOutput.addPage(page)
        ...     with open("output.pdf", mode="wb") as outPdf:
        ...         pdfOutput.write(outPdf)
    """
    from PyPDF2.generic import (DictionaryObject, NumberObject, FloatObject,
                                NameObject, TextStringObject, ArrayObject)
    x1, y1, x2, y2 = bbox
    quad_corners = (x1, y2, x2, y2, x1, y1, x2, y1)

    highlight = DictionaryObject()
    # Annotation flags value 4 is the "Print" flag.
    highlight[NameObject("/F")] = NumberObject(4)
    highlight[NameObject("/Type")] = NameObject("/Annot")
    highlight[NameObject("/Subtype")] = NameObject("/Highlight")
    highlight[NameObject("/T")] = TextStringObject(author)
    highlight[NameObject("/Contents")] = TextStringObject(contents)
    highlight[NameObject("/C")] = ArrayObject(
        [FloatObject(component) for component in color])
    highlight[NameObject("/Rect")] = ArrayObject(
        [FloatObject(coord) for coord in bbox])
    highlight[NameObject("/QuadPoints")] = ArrayObject(
        [FloatObject(coord) for coord in quad_corners])
    return highlight
Exemple #11
0
def pdf(request):
    """Fill the first page of ``family1.pdf`` with sample data and return it.

    The template is read from ``BASE_DIR``. Fields named in the sample data
    are filled and then marked read-only.

    Args:
        request: The incoming request; it is not read by this view.

    Returns:
        HttpResponse: the filled PDF, served inline as ``completed.pdf``.
    """
    template = os.path.join(BASE_DIR, "family1.pdf")

    data_dict = {
        'Fid': 'John\n',
        'Fname1': 'Smith\n',
        'Faadhar': '[email protected]\n',
        'Fcontact': '889-998-9967\n',
        'Faddress1': 'Amazing Inc.\n',
        'Faddress2': 'Dev\n',
        'Faddress3': '123 Main Way\n',
        'Fration': 'Johannesburg\n',
        'Farogya': 'New Mexico\n',
        'Faadhar1': 96705,
        'from_date': 'USA\n',
        'to_date': 'Who cares...\n'
    }

    # A context manager closes the template even when PDF processing raises.
    # The reader resolves objects lazily, so the document must be serialized
    # before the block ends.
    with open(template, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            # Ask viewers to regenerate field appearances so the filled-in
            # values are displayed and printed.
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data_dict)

        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in data_dict):
                # Field flags value 1 marks the filled field read-only.
                writer_annot.update({NameObject("/Ff"): NumberObject(1)})

        output_stream = BytesIO()
        pdf_writer.write(output_stream)

    # getvalue() returns the whole buffer regardless of stream position.
    response = HttpResponse(output_stream.getvalue(),
                            content_type='application/pdf')
    response['Content-Disposition'] = 'inline; filename="completed.pdf"'

    return response
    def create_highlight(self, x1, y1, x2, y2, meta, color=[0, 1, 0]):
        """
        Build a highlight annotation dictionary for a PDF page.

        Parameters
        ----------
        x1, y1 : float
            bottom left corner
        x2, y2 : float
            top right corner
        meta : dict
            must provide the keys "author" and "contents"
        color : iterable
            Three elements, (r,g,b); only read, never mutated

        Returns
        -------
        DictionaryObject
            the annotation, with /QuadPoints listing the corners as
            (x1, y2), (x2, y2), (x1, y1), (x2, y1)
        """
        def floats(values):
            # Wrap plain numbers into a PDF array of FloatObject entries.
            return ArrayObject([FloatObject(value) for value in values])

        annotation = DictionaryObject()
        annotation.update({
            # Annotation flags value 4 is the "Print" flag.
            NameObject("/F"): NumberObject(4),
            NameObject("/Type"): NameObject("/Annot"),
            NameObject("/Subtype"): NameObject("/Highlight"),
            NameObject("/T"): TextStringObject(meta["author"]),
            NameObject("/Contents"): TextStringObject(meta["contents"]),
            NameObject("/C"): floats(color),
            NameObject("/Rect"): floats((x1, y1, x2, y2)),
            NameObject("/QuadPoints"): floats(
                (x1, y2, x2, y2, x1, y1, x2, y1)),
        })

        return annotation
Exemple #13
0
def generatereport(field_dictionary, cl):
    """Fill the report template's first page and write it to ``<cl>.pdf``.

    The template is read from ``cwd + '/templates/report.pdf'``. Fields
    named in ``field_dictionary`` are filled and then marked read-only.

    Args:
        field_dictionary: Mapping of form field name (the annotation's
            ``/T``) to the value to write.
        cl: Output file name without the ``.pdf`` extension.
    """
    def set_need_appearances_writer(writer):
        """Set ``/NeedAppearances`` on the writer's AcroForm (best effort)."""
        try:
            catalog = writer._root_object
            if "/AcroForm" not in catalog:
                # NOTE(review): the reference targets object number
                # len(writer._objects); what it resolves to depends on
                # PyPDF2 internals - confirm.
                writer._root_object.update({
                    NameObject("/AcroForm"):
                    IndirectObject(len(writer._objects), 0, writer)
                })

            need_appearances = NameObject("/NeedAppearances")
            writer._root_object["/AcroForm"][need_appearances] = BooleanObject(
                True)
            return writer

        except Exception as e:
            # Deliberately best-effort: report the problem and carry on.
            print('set_need_appearances_writer() catch : ', repr(e))
            return writer

    outfile = cl + ".pdf"
    infile = cwd + '/templates/report.pdf'

    # Context managers close both files even when processing raises. The
    # reader resolves objects lazily, so writing happens while the input is
    # still open.
    with open(infile, "rb") as inputStream:
        pdf_reader = PyPDF2.PdfFileReader(inputStream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, field_dictionary)

        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            field_name = writer_annot.get('/T')
            if any(field_name == field for field in field_dictionary):
                writer_annot.update({
                    NameObject("/Ff"): NumberObject(1)  # make ReadOnly
                })

        # Open the output only once the document is ready, so a failure
        # above does not leave a truncated file behind.
        with open(outfile, "wb") as outputStream:
            pdf_writer.write(outputStream)
Exemple #14
0
    def _create_highlight(self,
                          x0,
                          y0,
                          width,
                          height,
                          comment,
                          author='',
                          color=[0, 0, 0, 0]):
        """Record a rectangle and build the matching highlight annotation.

        Calls ``self.add_rect(x0, y0, width, height)`` and then returns a
        ``/Highlight`` annotation covering the same area.

        Args:
            x0, y0: One corner of the rectangle.
            width, height: Extent of the rectangle along x and y.
            comment: Text stored as ``/Contents``.
            author: Text stored as ``/T``.
            color: Components stored in ``/C``. The default list is only
                read, never mutated.

        Returns:
            DictionaryObject: the highlight annotation.
        """
        self.add_rect(x0, y0, width, height)

        # The previous version used ``y0 + width`` for the vertical extent,
        # so the highlight was always square and ``height`` was ignored.
        x1 = x0 + width
        y1 = y0 + height

        highlight = DictionaryObject()
        highlight.update({
            # Annotation flags value 4 is the "Print" flag.
            NameObject("/F"):
            NumberObject(4),
            NameObject("/Type"):
            NameObject("/Annot"),
            NameObject("/Subtype"):
            NameObject("/Highlight"),
            NameObject("/T"):
            TextStringObject(author),
            NameObject("/Contents"):
            TextStringObject(comment),
            NameObject("/C"):
            ArrayObject([FloatObject(c) for c in color]),
            NameObject("/Rect"):
            ArrayObject([FloatObject(v) for v in (x0, y0, x1, y1)]),
            # Corner order: (x0, y1), (x1, y1), (x0, y0), (x1, y0).
            NameObject("/QuadPoints"):
            ArrayObject([
                FloatObject(v) for v in (x0, y1, x1, y1, x0, y0, x1, y0)
            ]),
        })

        return highlight
Exemple #15
0
def update_form_values(infile, outfile, newvals=None):
    """Fill the form fields of ``infile`` and write the result to ``outfile``.

    Every page is copied to the output. Fields named in ``newvals`` are
    filled and marked read-only. A page whose fields cannot be updated is
    still copied; the error is printed and processing continues.

    Args:
        infile: Path of the source PDF form.
        outfile: Path the filled PDF is written to.
        newvals: Mapping of field name to new value. When empty or ``None``,
            placeholder values of the form ``#<index> <name>=<value>`` are
            generated from ``get_form_fields(infile)``.
    """
    # The reader resolves objects lazily, so the input stays open until the
    # output has been written; the context manager then closes it.
    with open(infile, 'rb') as in_stream:
        pdf = PdfFileReader(in_stream)
        writer = PdfFileWriter()
        set_need_appearances_writer(writer)

        if "/AcroForm" in writer._root_object:
            writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # The default values do not depend on the page, so build them once
        # instead of re-checking on every loop iteration.
        if not newvals:
            newvals = {
                k: f'#{i} {k}={v}'
                for i, (k, v) in enumerate(get_form_fields(infile).items())
            }

        for page_index in range(pdf.getNumPages()):
            page = pdf.getPage(page_index)

            try:
                writer.updatePageFormFieldValues(page, newvals)

                for annot_ref in page['/Annots']:
                    writer_annot = annot_ref.getObject()
                    field_name = writer_annot.get('/T')
                    if any(field_name == field for field in newvals):
                        # Field flags value 1 marks the field read-only.
                        writer_annot.update(
                            {NameObject("/Ff"): NumberObject(1)})
            except Exception as e:
                # Best effort: e.g. a page without /Annots raises here. The
                # page is still copied below.
                print(repr(e))

            writer.addPage(page)

        with open(outfile, 'wb') as out:
            writer.write(out)
def createHighlight(x1, y1, x2, y2, meta, color=[1, 0, 0]):
    '''
    Build a highlight annotation for a box on a PDF page.

    Coordinates start in the bottom left of the page. ``meta`` must provide
    the keys "author" (stored as /T) and "contents" (stored as /Contents);
    ``color`` supplies the /C components and is only read, never mutated.
    Returns the annotation as a DictionaryObject.
    '''
    rect_values = (x1, y1, x2, y2)
    # Corner order: (x1, y2), (x2, y2), (x1, y1), (x2, y1).
    quad_values = (x1, y2, x2, y2, x1, y1, x2, y1)

    fields = [
        # Annotation flags value 4 is the "Print" flag.
        (NameObject("/F"), NumberObject(4)),
        (NameObject("/Type"), NameObject("/Annot")),
        (NameObject("/Subtype"), NameObject("/Highlight")),
        (NameObject("/T"), TextStringObject(meta["author"])),
        (NameObject("/Contents"), TextStringObject(meta["contents"])),
        (NameObject("/C"),
         ArrayObject([FloatObject(component) for component in color])),
        (NameObject("/Rect"),
         ArrayObject([FloatObject(value) for value in rect_values])),
        (NameObject("/QuadPoints"),
         ArrayObject([FloatObject(value) for value in quad_values])),
    ]

    highlight = DictionaryObject()
    highlight.update(dict(fields))
    return highlight
Exemple #17
0
def pdf_flatten(filename, number):
    """Copy a PDF form, writing ``number`` into its ``number`` field.

    Every page is copied. Annotations whose field name (``/T``) is
    ``"number"`` get ``number`` as their value and are marked read-only.

    Args:
        filename: Path of the source PDF form.
        number: Value written into the ``number`` field.

    Returns:
        Path of the written file, ``filename + "-flatten.pdf"``.
    """
    flatten_form = filename + "-flatten.pdf"

    # One-element tuple. The previous ``("number")`` was a plain string, so
    # ``field in flatten_dict`` was a substring test that also matched names
    # such as "num" or "n".
    flatten_fields = ("number",)

    # The reader resolves objects lazily, so the input stays open until the
    # output has been written. Both files are closed deterministically.
    with open(filename, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            # Ask viewers to regenerate field appearances so the new value
            # is displayed and printed.
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        # getFields() returns None for a document without form fields.
        data_dict = pdf_reader.getFields() or {}

        for page_index in range(pdf_reader.numPages):
            pdf_writer.addPage(pdf_reader.getPage(page_index))
            page = pdf_writer.getPage(page_index)
            # Pages without annotations have no fields to update.
            if "/Annots" not in page:
                continue
            for annot_ref in page['/Annots']:
                writer_annot = annot_ref.getObject()
                for field in data_dict:
                    if writer_annot.get('/T') == field and field in flatten_fields:
                        writer_annot.update({
                            NameObject("/Ff"):
                            NumberObject(1),  # make ReadOnly
                            NameObject("/V"):
                            TextStringObject(number),  # update the value
                        })

        with open(flatten_form, "wb") as output_stream:
            pdf_writer.write(output_stream)

    return flatten_form
def create_annotation(x, y, meta):
    """Build a link annotation and a text (comment) annotation at ``(x, y)``.

    Args:
        x, y: First two ``/Rect`` entries of both annotations. The link
            rectangle extends 20 units from there, the comment rectangle 5.
        meta: Mapping with the keys ``"contents"`` (used as the link URI and
            as the comment text) and ``"author"`` (the comment's ``/T``).

    Returns:
        tuple: ``(linkAnnotation, commentAnnotation)``, both
        ``DictionaryObject`` instances.
    """
    color = [255.0 / 255.0, 209 / 255.0, 0]
    # link
    linkAnnotation = DictionaryObject()
    # https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
    linkAnnotation.update({
        # Table 165: annotation flags; value 4 is the Print flag
        NameObject("/F"):
        NumberObject(4),
        NameObject("/Type"):
        NameObject("/Annot"),
        NameObject("/Subtype"):
        NameObject("/Link"),

        # Table 164 color, annotation rectangle
        NameObject("/C"):
        ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"):
        ArrayObject([
            FloatObject(x),
            FloatObject(y),
            FloatObject(x + 20),
            FloatObject(y + 20)
        ]),

        # Table 173 link annotation
        NameObject('/A'):
        DictionaryObject({
            # Table 206 uri
            NameObject('/S'): NameObject('/URI'),
            NameObject('/URI'): TextStringObject(meta["contents"])
        }),
        # Table 173 invert rect when mouse
        NameObject('/H'):
        NameObject('/I'),
        # table 164 hor corner radius, vert corner radius, border width
        # dash array table 56
        # These entries are numbers, so they are NumberObject instances. The
        # previous NameObject(0) only worked because a name without a
        # leading slash happened to be written out as the bare digit.
        NameObject('/Border'):
        ArrayObject([
            NumberObject(0),
            NumberObject(0),
            NumberObject(5),
        ]),
    })

    commentAnnotation = DictionaryObject()
    # https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
    commentAnnotation.update({
        # Table 165: annotation flags; value 4 is the Print flag
        NameObject("/F"):
        NumberObject(4),
        NameObject("/Type"):
        NameObject("/Annot"),
        NameObject("/Subtype"):
        NameObject("/Text"),

        # Table 170 titlebar
        NameObject("/T"):
        TextStringObject(meta["author"]),
        NameObject("/Contents"):
        TextStringObject(meta["contents"]),

        # Table 164 color, annotation rectangle
        NameObject("/C"):
        ArrayObject([FloatObject(c) for c in color]),
        NameObject("/Rect"):
        ArrayObject([
            FloatObject(x),
            FloatObject(y),
            FloatObject(x + 5),
            FloatObject(y + 5)
        ]),

        # 12.5.6.4 text annotation
        NameObject('/Open'):
        BooleanObject(False),
        NameObject('/Name'):
        NameObject('/Comment'),
    })

    return linkAnnotation, commentAnnotation
def main():
    """Assemble one PDF e-book from locally saved JSON dumps.

    Reads ``bookinfo.json``, ``pageinfo.json``, ``pages.json`` and
    ``bookmarks.json`` from the current directory. Each page's base64 PDF
    data is written to a temporary directory, the pages are merged in file
    name order, bookmarks and page labels are added, and the result is
    written to ``<book title>.pdf`` (with ``/`` removed and ``:`` replaced
    by ``_``).
    """

    print("Loading metadata and eText information...")

    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    with open("pages.json", 'r') as file:
        downloadedData = json.loads(file.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        """Return the decoded PDF bytes stored for ``page_id``."""
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        if b is None:
            # Fail with a clear message instead of the opaque TypeError that
            # slicing None would raise.
            raise KeyError("no page data found for pageID {!r}".format(page_id))
        # The payload is a data URL; strip its prefix before decoding.
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to hold all the single-page pdf files.
        # Maps bookPageNumber -> pageOrder; filled in by download().
        pdfPageTable = {}

        # First, write the cover file
        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, write all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(pdfDownloadDir,
                                    "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))

        threadPool = ThreadPool(40)
        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        try:
            threadPool.map(download, pageInfo)
        finally:
            # Release the worker threads; previously the pool was never
            # closed.
            threadPool.close()
            threadPool.join()

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True

        # TODO: Bookmarks currently not supported
        with open("bookmarks.json", 'r') as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except (ValueError, KeyError, IndexError, TypeError):
                # Malformed or empty bookmark data: continue without it.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)

                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0) # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (pageOrder, bookPageNumber) pairs, sorted by numeric page order.
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        # NOTE(review): NameObject("(cover)") relies on the name being written
        # verbatim so that it reads back as a PDF literal string; a prefix
        # containing parentheses or backslashes would not be escaped.
        labels = ArrayObject([
            NumberObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            # Match each pattern once and reuse the match object.
            arabicMatch = arabicRegex.match(pageLabel)
            if arabicMatch:
                curr_mode = "arabic"
                prefix = arabicMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            else:
                romanMatch = romanRegex.match(pageLabel)
                if romanMatch:
                    curr_mode = "roman"
                    prefix = romanMatch.group("prefix")
                    style.update({NameObject("/S"): NameObject("/r")})
            # Start a new label range whenever the numbering style or the
            # prefix changes.
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix
        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        root_obj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(eTextUrl):
    """Download every page of an eText book and merge them into ``out.pdf``.

    The query string of ``eTextUrl`` supplies the book, user, session and
    scenario identifiers. Book metadata, the page list and the bookmark list
    are fetched over HTTP, each page PDF is downloaded into a temporary
    directory, and the pages are merged (cover first). A ``/PageLabels``
    entry is added to the document root so that the roman / arabic page
    labels reported by the server are carried into the output.

    :param eTextUrl: URL of the book; only the part after the last ``?`` is
        parsed. Parameters may be given directly or packed into a single
        ``values`` field as ``name::value::name::value...``.
    """
    bookData = urllib.parse.parse_qs(eTextUrl.split("?")[-1])
    if bookData.get("values") is not None:
        # Unpack "name::value::name::value..." into the same
        # {name: [value]} shape that parse_qs produces.
        packed = bookData["values"][0].split("::")
        bookData = {
            itemName: [itemValue]
            for itemName, itemValue in zip(packed[0::2], packed[1::2])
        }
        # A few fixes in terms of capitalization
        bookData["bookid"] = bookData["bookID"]
        bookData["userid"] = bookData["userID"]
        bookData["sessionid"] = bookData["sessionID"]

        # We'll default to the roletypeid for a student
        bookData["roletypeid"] = [roletypeid]  # 3 for Instructor... the server doesn't care, though

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookData["bookid"][0])
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        bookInfo = json.loads(bookInfoRequest.read().decode('utf-8'))
    bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userid=bookData['userid'][0],
        userroleid=bookData['roletypeid'][0],
        bookid=bookData['bookid'][0],
        bookeditionid=bookInfo['bookEditionID'],
        authkey=bookData['sessionid'][0],
    )
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
    pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        """Return the hsidUrl-wrapped download URL for one page asset."""
        pdfPage = pdfPage.replace("/assets/", "")
        return hsidUrl(pdfUrl.format(
            bookid=bookInfo['globalBookID'],
            pdfpage=pdfPage,
            iscover=isCover,
            authkey=bookData['sessionid'][0],
        ))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to.
        # Maps the book's own page label -> position of the page in the file.
        pdfPageTable = {}

        # First, download the cover file
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            """Record the page's label/order and fetch its PDF."""
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly. The context
        # manager releases the worker threads once map() has returned.
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages.
        # The zero-padded file names make lexical order equal page order.
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=bookData['roletypeid'][0],
            bookid=bookData['bookid'][0],
            language=language,
            authkey=bookData['sessionid'][0],
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=bookData['scenario'][0],
        )
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
        # NOTE(review): the parsed bookmark tree is fetched and validated here
        # but only the cover bookmark below is actually added.
        bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]

        fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # build (pageOrder, pageLabel) pairs sorted by position in the file.
        pdfPageLabelTable = sorted(
            ((order, label) for label, order in pdfPageTable.items()),
            key=lambda entry: int(entry[0]))
        # /Nums is a flat list of "page index, label dictionary" pairs; the
        # index must be an integer object, hence NumberObject.
        # NOTE(review): the label prefixes are emitted through NameObject so
        # that the literal text "(cover)" is written verbatim — confirm before
        # switching to a string object type.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")}),
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            arabicMatch = arabicRegex.match(pageLabel)
            romanMatch = None if arabicMatch else romanRegex.match(pageLabel)
            if arabicMatch:
                currMode = "arabic"
                prefix = arabicMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanMatch:
                currMode = "roman"
                prefix = romanMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only start a new label range when the numbering style or the
            # prefix changes compared with the previous page.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        fileMerger._root_object.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("out.pdf", "wb") as outFile:
            fileMerger.write(outFile)
Exemple #21
0
def main(bookId):
    """Download a book by id (or by reader URL) and write it as one PDF.

    Book metadata, the page list and the bookmark tree are fetched over
    HTTP; every page PDF is downloaded into a temporary directory and merged
    (cover first). Bookmarks are added when the server returns them, and a
    ``/PageLabels`` entry is added to the document root so the roman /
    arabic page labels reported by the server are carried into the output.
    The result is written to ``"<bookId> - <title>.pdf"`` in the current
    directory, with ``/`` removed and ``:`` replaced by ``_``.

    :param bookId: numeric book id (``int`` or numeric string), or a URL
        starting with ``http`` whose query string carries ``bookid`` either
        directly or packed into a ``values`` field as
        ``name::value::name::value...``.
    """
    # An int id has no .startswith(); only strings can be URLs.
    if isinstance(bookId, str) and bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if bookData.get("values") is not None:
            # Unpack "name::value::name::value..." into {name: [value]}.
            packed = bookData["values"][0].split("::")
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(packed[0::2], packed[1::2])
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]

    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookId)
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        bookInfo = json.loads(bookInfoRequest.read().decode('utf-8'))
    bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
    pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        """Return the hsidUrl-wrapped download URL for one page asset."""
        pdfPage = pdfPage.replace("/assets/", "")
        return hsidUrl(pdfUrl.format(bookid=bookInfo['globalBookID'],
                                     pdfpage=pdfPage,
                                     iscover=isCover))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to.
        # Maps the book's own page label -> position of the page in the file.
        pdfPageTable = {}

        # First, download the cover file
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            """Record the page's label/order and fetch its PDF."""
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']),
                                       savePath)

        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly. The context
        # manager releases the worker threads once map() has returned.
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages.
        # The zero-padded file names make lexical order equal page order.
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir,
                                           pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)

        bookmarksExist = True

        with urllib.request.urlopen(
                hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(
                    bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                # Deliberately best-effort: any failure to read or parse the
                # bookmark list just means the PDF is written without them.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            """Add ``aDict`` (one bookmark or a list of them) under ``parent``."""
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']
                              ['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)

                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark(
                "Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # build (pageOrder, pageLabel) pairs sorted by position in the file.
        pdfPageLabelTable = sorted(
            ((order, label) for label, order in pdfPageTable.items()),
            key=lambda entry: int(entry[0]))
        # /Nums is a flat list of "page index, label dictionary" pairs; the
        # index must be an integer object, hence NumberObject.
        # NOTE(review): the label prefixes are emitted through NameObject so
        # that the literal text "(cover)" is written verbatim — confirm before
        # switching to a string object type.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")}),
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            arabicMatch = arabicRegex.match(pageLabel)
            romanMatch = None if arabicMatch else romanRegex.match(pageLabel)
            if arabicMatch:
                currMode = "arabic"
                prefix = arabicMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanMatch:
                currMode = "roman"
                prefix = romanMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only start a new label range when the numbering style or the
            # prefix changes compared with the previous page.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        fileMerger._root_object.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        outputName = "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
            "/", "").replace(":", "_")
        with open(outputName, "wb") as outFile:
            fileMerger.write(outFile)
Exemple #22
0
class PdfEnhancedFileWriter(PdfFileWriter):
    """PdfFileWriter that can neutralise colours and drop highlight rectangles.

    ``removeWordStyle`` rewrites every page's content stream: rgb/cmyk colour
    operators are replaced with black (before text) or white (before a
    rectangle), and ``re`` rectangles whose size looks like a text highlight
    are removed.
    """

    # Replacement colour operands, indexed by colour space and colour name.
    colors_operands = {
        'rgb': {
            'black': [NumberObject(0), NumberObject(0), NumberObject(0)],
            'white': [NumberObject(1), NumberObject(1), NumberObject(1)],
        },
        'cmyk': {
            'black': [
                NumberObject(0), NumberObject(0),
                NumberObject(0), NumberObject(1),
            ],
            'white': [
                NumberObject(0), NumberObject(0),
                NumberObject(0), NumberObject(0),
            ],
        },
        'grayscale': {
            'black': [NumberObject(0)],
            'white': [NumberObject(1)],
        },
    }

    def _getOperatorType(self, operator):
        """Classify a content-stream operator, or return None if unknown."""
        categories = (
            ('text', ('Tj', "'", '"', 'TJ')),
            ('rgb', ('rg', 'RG')),  # color
            ('cmyk', ('k', 'K')),  # color
            ('grayscale', ('g', 'G')),  # color
            ('rectangle', ('re',)),
            ('line', ('l', 'm', 'S')),  # line, start line, stroke(paint) line
        )
        lookup = {}
        for category, names in categories:
            for name in names:
                lookup[b_(name)] = category
        return lookup.get(operator)

    def _getColorTargetOperationType(self, color_index, operations):
        """Return the kind of the first drawing operation after a colour.

        Scans ``operations`` from the entry following ``color_index`` and
        returns ``'text'``, ``'rectangle'`` or ``'line'`` for the first such
        operator found, or ``False`` when none follows.
        """
        drawing_kinds = ('text', 'rectangle', 'line')
        for position in range(color_index + 1, len(operations)):
            kind = self._getOperatorType(operations[position][1])
            if kind in drawing_kinds:
                return kind
        return False

    def getMinimumRectangleWidth(self, fontSize, minimumNumberOfLetters=1.5):
        """Return the width of ``minimumNumberOfLetters`` letters at ``fontSize``."""
        return minimumNumberOfLetters * fontSize

    def removeWordStyle(self, ignoreByteStringObject=False):
        """
        Strip Word-imported styling (colours and highlight rectangles) from
        every page of this output.

        :param bool ignoreByteStringObject: when true, text operands that are
            not ``TextStringObject`` instances are replaced with an empty
            ``TextStringObject``.
        """
        # Index of the string operand for each single-string text operator.
        string_operand_at = {b_('Tj'): 0, b_("'"): 0, b_('"'): 2}
        font_op = b_('Tf')
        array_text_op = b_("TJ")
        rect_op = b_('re')
        recolor = {'text': 'black', 'rectangle': 'white'}

        for page in self.getObject(self._pages)['/Kids']:
            pageRef = self.getObject(page)
            content = pageRef['/Contents'].getObject()
            if not isinstance(content, ContentStream):
                content = ContentStream(content, pageRef)

            kept_operations = []
            current_font_size = 0

            for position, (operands, operator) in enumerate(content.operations):
                # Track the most recent font size; it sizes the highlight test.
                if operator == font_op and operands[0][:2] == '/F':
                    current_font_size = operands[1].as_numeric()

                if operator in string_operand_at:
                    slot = string_operand_at[operator]
                    text = operands[slot]
                    if ignoreByteStringObject and not isinstance(
                            text, TextStringObject):
                        operands[slot] = TextStringObject()
                elif operator == array_text_op:
                    pieces = operands[0]
                    for i in range(len(pieces)):
                        if ignoreByteStringObject and not isinstance(
                                pieces[i], TextStringObject):
                            pieces[i] = TextStringObject()

                operator_kind = self._getOperatorType(operator)

                # Grayscale colours are deliberately left alone: tests showed
                # that black underlines, borders and tables are defined in
                # grayscale rather than rgb/cmyk.
                if operator_kind in ('rgb', 'cmyk'):
                    target_kind = self._getColorTargetOperationType(
                        position, content.operations)
                    # Text becomes black and rectangles white; dropping the
                    # colour entirely would paint rectangles black.
                    replacement = recolor.get(target_kind)
                    if replacement:
                        operands = self.colors_operands[operator_kind][
                            replacement]

                # Drop styled rectangles (highlights, lines, etc.). 're' is the
                # path-construction operator that creates a rectangle, which is
                # presumably how Word embeds its graphics in the PDF.
                if operator == rect_op:
                    width = operands[-2].as_numeric()
                    height = operands[-1].as_numeric()

                    # (length of X letters at the current size)
                    narrowest = self.getMinimumRectangleWidth(
                        current_font_size, 1)
                    tallest = current_font_size + 6  # catch really big highlights
                    shortest = 1.5  # so that thin lines will not be removed

                    # Skip (remove) only rectangles wider than the minimum
                    # whose height lies in (shortest, tallest].
                    if width > narrowest and height > shortest and height <= tallest:
                        continue

                kept_operations.append((operands, operator))

            content.operations = kept_operations
            pageRef[NameObject('/Contents')] = content
Exemple #23
0
    def cloneDocumentFromReader(self, reader: PdfFileReader, *args) -> None:
        """Clone the document held by a PDF file reader into this writer.

        The reader's document root is cloned via ``cloneReaderDocumentRoot``
        and its indirect references are swept into this writer. If this
        writer already holds pages (its page tree ``/Count`` is > 0), the
        cloned page tree is attached as a child of the existing one and the
        counts are summed; otherwise the existing page-tree object is removed
        (if present in ``self._objects``) and replaced by the cloned one.

        :param reader: PDF file reader instance from which the clone
            should be created.
        :param args: accepted but not referenced anywhere in this body.
            NOTE(review): earlier documentation described an
            ``after_page_append`` callback (delegating to
            ``appendPagesFromReader``); nothing here invokes one — confirm
            whether that is still intended.
        """
        mustAddTogether = False
        newInfoRef = self._info
        oldPagesRef = self._pages
        oldPages = self.getObject(self._pages)

        # If there have already been any number of pages added

        if oldPages[NameObject("/Count")] > 0:
            # Keep them
            mustAddTogether = True
        else:
            # Throw the (empty) page tree object out; the info reference to
            # restore at the end becomes the old pages reference.
            if oldPages in self._objects:
                newInfoRef = self._pages
                self._objects.remove(oldPages)

        # Clone the reader's root document
        self.cloneReaderDocumentRoot(reader)
        # Register the root object if it does not have a reference yet
        if not self._root:
            self._root = self._addObject(self._root_object)

        # Sweep for all indirect references.
        # self.stack is created just for this call and deleted right after;
        # presumably the sweep uses it as scratch state — confirm.
        externalReferenceMap = {}
        self.stack = []
        newRootRef = self._sweepIndirectReferences(externalReferenceMap,
                                                   self._root)

        # Delete the stack to reset
        del self.stack

        # Clean-Up Time!!!
        # Get the new root of the PDF
        realRoot = self.getObject(newRootRef)

        # Get the new pages tree root and its ID Number
        # (the id is the object's position in self._objects, plus one)
        tmpPages = realRoot[NameObject("/Pages")]
        newIdNumForPages = 1 + self._objects.index(tmpPages)

        # Make an IndirectObject just for the new Pages
        self._pages = IndirectObject(newIdNumForPages, 0, self)

        # If there are any pages to add back in
        if mustAddTogether:
            # Set the new page tree root's parent to the old
            # page tree root's reference
            tmpPages[NameObject("/Parent")] = oldPagesRef
            # Add the reference to the new page tree root in
            # the old page tree root's kids array

            newPagesRef = self._pages
            oldPages[NameObject("/Kids")].append(newPagesRef)
            # Point both the writer and the cloned root back at the
            # old page tree root, which now contains the new one
            self._pages = oldPagesRef
            realRoot[NameObject("/Pages")] = oldPagesRef
            # Update the count attribute of the old page tree root
            # to include the pages of the cloned tree
            oldPages[NameObject("/Count")] = \
                NumberObject(oldPages[NameObject("/Count")] + tmpPages[NameObject("/Count")])
        else:
            # Bump up the info's reference b/c the old
            # page tree was bumped off
            self._info = newInfoRef
Exemple #24
0
def add_geospatial_pdf_header(m, f, f2, map_bounds, poly, epsg=None, wkt=None):
    """
    Add geospatial PDF information to every page read from ``f`` and write
    the result to ``f2``, as per:
        Adobe® Supplement to the ISO 32000 PDF specification
        BaseVersion: 1.7
        ExtensionLevel: 3
        (June 2008)

    Each page receives a ``/VP`` array holding one viewport whose ``/BBox``
    is taken from the page's media box and whose ``/Measure`` comes from
    ``get_pdf_measure(m, <projection dictionary>, poly, map_bounds)``.

    Notes:
        The epsg code or the wkt text of the projection must be provided.
        Must be called *after* the page has had .finish() called.

    Returns ``f2``. Raises RuntimeError when PyPDF2 is unavailable or when
    neither ``epsg`` nor ``wkt`` is given.
    """
    if not HAS_PYPDF2:
        raise RuntimeError(
            "PyPDF2 not available; PyPDF2 required to add geospatial header to PDF"
        )

    if not (epsg or wkt):
        raise RuntimeError(
            "EPSG or WKT required to add geospatial header to PDF")

    reader = PdfFileReader(f)
    writer = PdfFileWriter()

    # preserve OCProperties at document root if we have one
    oc_key = NameObject('/OCProperties')  # Python3-friendly lookup key
    if oc_key in reader.trailer['/Root']:
        writer._root_object[oc_key] = reader.trailer['/Root'].getObject()[oc_key]

    for page in reader.pages:
        # Projected coordinate system dictionary for this page.
        projection = DictionaryObject()
        projection[NameObject('/Type')] = NameObject('/PROJCS')
        if epsg:
            projection[NameObject('/EPSG')] = NumberObject(int(epsg))
        if wkt:
            projection[NameObject('/WKT')] = TextStringObject(wkt)

        measure = get_pdf_measure(m, projection, poly, map_bounds)

        # The VP entry is an array of viewport dictionaries. A viewport is
        # basically a rectangular region on the PDF page. The only required
        # entry is the BBox, which specifies the location of the viewport on
        # the page.
        corners = (0, int(page.mediaBox[3]), int(page.mediaBox[2]), 0)  # in pts
        bbox = ArrayObject([FloatObject(str(value)) for value in corners])

        viewport = DictionaryObject()
        viewport[NameObject('/Type')] = NameObject('/Viewport')
        viewport[NameObject('/BBox')] = bbox
        viewport[NameObject('/Measure')] = measure

        viewports = ArrayObject()
        viewports.append(viewport)
        page[NameObject('/VP')] = viewports
        writer.addPage(page)

    writer.write(f2)
    return f2
Exemple #25
0
def test_number_object_exception():
    """Expect OverflowError when building a NumberObject from ``1.5 * 2**10000``."""
    # NOTE(review): multiplying a float by the int 2**10000 cannot be
    # represented as a float, so the OverflowError may already be raised while
    # the argument is evaluated, before NumberObject is called — confirm this
    # is what the test is meant to cover. The expression must stay inside the
    # ``with`` block either way.
    with pytest.raises(OverflowError):
        NumberObject(1.5 * 2**10000)
def _iter_internal_link_annotations(pdf):
    """Yield ``(page_number, annotation, action, fragment)`` for internal links.

    A link counts as internal when its action carries a ``file://`` URI that
    contains a ``#``; ``fragment`` is the text between the first and second
    ``#``. Pages without ``/Annots``, annotations without ``/A`` and actions
    without ``/URI`` are skipped rather than raising ``KeyError``.
    """
    for page_number in range(pdf.getNumPages()):
        page = pdf.getPage(page_number)
        if '/Annots' not in page:
            continue
        for annotation_ref in page['/Annots']:
            annotation = annotation_ref.getObject()
            if '/A' not in annotation:
                continue
            action = annotation['/A']
            if '/URI' not in action:
                continue
            uri = action['/URI']
            if uri[0:7] != 'file://':
                continue
            uri_parts = uri.split('#')
            if len(uri_parts) > 1:
                yield page_number, annotation, action, uri_parts[1]


def concat_and_clean():
    """Bundle the rendered notebooks into one HTML file and post-process the PDF.

    For every path in ``ipynb_files`` whose rendered ``<stem>.html`` exists in
    ``build_directory``, the file is read and deleted, and its cells (minus
    the final two, which are expected to only load custom CSS) are appended
    as a ``<section>`` to the first notebook's document. That first document
    also receives the cover page, the cover styles and the custom style
    elements. The bundle is written to ``bundle.html``, rendered by invoking
    ``phantomjs``, and the PDF at ``output/BMLIP-5SSD0.pdf`` is then rewritten
    in place: internal ``file://`` links become ``/GoTo`` actions and page
    numbers are drawn next to the table-of-contents entries. Finally
    ``build_directory`` is removed.
    """
    bundle_document = None
    notebook_root = None

    for source_path_str in ipynb_files:
        source_path = Path(source_path_str)
        bundle_path = Path(build_directory, source_path.stem + '.html')

        if not bundle_path.exists():
            continue

        with open(str(bundle_path), 'r', encoding='utf-8') as rendered_file:
            html_source = html.fromstring(rendered_file.read())

        os.remove(str(bundle_path))

        cells = html_source.find_class('cell')

        # If no main document body is available yet, grab it from the first
        # page that is found. Use that as the basis for the entire bundle
        if bundle_document is None:
            bundle_document = html_source
            body_node = bundle_document.xpath('//body')[0]

            with open('./cover.html', 'r', encoding='utf-8') as cover_file:
                cover_source = html.fromstring(cover_file.read())

            cover_styles = cover_source.xpath('//style')[0]
            bundle_head = bundle_document.xpath('//head')[0]
            bundle_head.insert(2, cover_styles)

            cover_page = cover_source.get_element_by_id('cover-page')
            body_node.insert(0, cover_page)

            # Include the custom styles once, which would have otherwise be
            # included by the final two cells in every rendered notebook
            with open(str(Path('styles', 'aipstyle.html')), 'r') as style_file:
                custom_style_elements = html.fragments_fromstring(style_file.read())

            # Inserting at index 0 in reverse keeps the original order.
            for element in reversed(custom_style_elements):
                body_node.insert(0, element)

            # Even though the final two cells will be ignored below they
            # have to be actually removed from the 'base' document to make
            # sure they don't show up at the top of the notebook
            notebook_root = html_source.get_element_by_id('notebook-container')
            notebook_root.remove(cells[-2])
            notebook_root.remove(cells[-1])

        # The final two cells of every rendered notebook are expected to
        # contain the code loading custom CSS, these are not necessary for
        # the bundled version and should therefore be removed
        chapter_node = html.fromstring('<section></section>')
        notebook_root.append(chapter_node)
        for cell in cells[0:-2]:
            chapter_node.append(cell)

    shutil.copy('./styles/bundle.css', str(Path(build_directory, 'custom.css')))

    if bundle_document is not None:
        bundle_filename = str(Path(build_directory, 'bundle.html'))
        with open(bundle_filename, 'w', encoding='utf-8') as bundle_file:
            bundle_file.write(html.tostring(bundle_document).decode('utf-8'))

        os.system('phantomjs --web-security=no ./print_to_pdf.js bundle/bundle.html')

        # The generated PDF file contains incorrect URIs for the table of
        # contents and the backlinks to the items in it. What used to be
        # 'internal' URIs in the HTML document have become 'external' URIs in
        # the PDF document. These have to be rewritten using a two-pass
        # mechanism. First the location of each of the links (both in the table
        # of contents as in the actual document) are collected. These should,
        # mostly, come in pairs. In the first pass, the location of each of
        # these links is recorded and stored with respect to the link it should
        # be linked from. Then in the second pass, all links are updated to
        # point to the correct page and offset on that page based on the
        # information gathered in the first pass.
        source_pdf = PdfFileReader('output/BMLIP-5SSD0.pdf')

        # Link dictionaries store links using their names as key with tuples
        # specifying their corresponding (page, top of link rectangle) as values
        links = {}

        # Pass 1: collect pages and offsets for all internal links. Each link
        # is filed under the name of its counterpart ('toc-X' <-> 'X').
        for page_number, annotation, _action, fragment in \
                _iter_internal_link_annotations(source_pdf):
            location = (page_number, annotation['/Rect'][3])
            if fragment[0:4] == 'toc-':
                links[fragment[4:]] = location
            else:
                links['toc-' + fragment] = location

        # Pass 2: modify all links to point to the proper internal locations
        for _page_number, annotation, action, fragment in \
                _iter_internal_link_annotations(source_pdf):
            # Always remove the URI pointing to the non-existent
            # external file
            del action['/URI']

            # Not all link targets actually exist in the document (such
            # as those on the first page), these have to be ignored
            if fragment in links:
                target_page, target_top = links[fragment]
                action.update({
                    NameObject('/D'): ArrayObject([
                        NumberObject(target_page),
                        NameObject('/FitH'),
                        NumberObject(target_top),
                    ]),
                    NameObject('/S'): NameObject('/GoTo')
                })
            else:
                # Update the rectangle to effectively disable the link
                annotation.update({
                    NameObject('/Rect'): ArrayObject(
                        [NumberObject(0) for _ in range(4)])
                })

        target_pdf = PdfFileWriter()
        target_pdf.appendPagesFromReader(source_pdf)

        # Manually add page numbers to the table of contents
        toc_stream = BytesIO()
        toc_canvas = canvas.Canvas(toc_stream, pagesize=A4)
        current_page = 0

        # The first page is the cover page, and thus empty as far as the table of
        # contents is concerned
        toc_links = [name for name in links if name[0:4] == 'toc-']
        for toc_link in sorted(toc_links, key=lambda name: links[name][0]):
            toc_page, toc_top = links[toc_link]
            if toc_page > current_page:
                toc_canvas.showPage()
                current_page = toc_page

            target_name = toc_link[4:]
            if target_name in links:
                target_page = links[target_name][0]
                toc_canvas.drawRightString(575, -10 + toc_top, '%d' % (target_page + 1))
        toc_canvas.save()

        # Overlay the drawn page numbers onto the matching pages (page 0, the
        # cover, is skipped).
        toc_stream.seek(0)
        toc_pdf = PdfFileReader(toc_stream)
        for page_number in range(1, toc_pdf.getNumPages()):
            target_page = target_pdf.getPage(page_number)
            target_page.mergePage(toc_pdf.getPage(page_number))

        with open('output/BMLIP-5SSD0.pdf', 'wb') as target_file:
            target_pdf.write(target_file)

    shutil.rmtree(build_directory)
    # Acro form is form field, set needs appearances to fix printing issues
    pdf_writer._root_object["/AcroForm"].update(
        {NameObject("/NeedAppearances"): BooleanObject(True)})

data_dict = dict()  # this is a dict of your DB form values

# Copy the first page into the writer and fill in its form fields.
pdf_writer.addPage(pdf_reader.getPage(0))
page = pdf_writer.getPage(0)
# update form fields
pdf_writer.updatePageFormFieldValues(page, data_dict)
for annot_ref in page['/Annots']:
    writer_annot = annot_ref.getObject()
    if any(writer_annot.get('/T') == field for field in data_dict):
        # /Ff is a bit field: OR the ReadOnly bit (value 1) into whatever
        # flags the field already has instead of overwriting them, so other
        # flag bits already set on the field are kept.
        existing_flags = int(writer_annot['/Ff']) if '/Ff' in writer_annot else 0
        writer_annot.update({
            NameObject("/Ff"): NumberObject(existing_flags | 1)  # make ReadOnly
        })
output_stream = BytesIO()
pdf_writer.write(output_stream)

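# output_stream now holds the filled-in PDF with the updated fields marked
# read-only; call output_stream.seek(0) before reading from it.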


def set_need_appearances_writer(writer):
    # basically used to ensured there are not
    # overlapping form fields, which makes printing hard
    try:
        catalog = writer._root_object
        # get the AcroForm tree and add "/NeedAppearances attribute
        if "/AcroForm" not in catalog:
            writer._root_object.update({
Exemple #28
0
def generate_student_report(a, b):
    """Fill the report-card PDF template with one student's exam results.

    The student's row is read from table ``b`` of the ``exam`` database and
    turned into form-field values.  Those values are written into the first
    page of ``templates/report2.pdf`` and every filled field is marked
    read-only.  The result is saved as ``uploads/<a>.pdf``; both paths are
    relative to the module-level ``cwd``.

    Args:
        a: Student name; used as the lookup key and as the output file stem.
        b: Name of the table holding the results of the exam.

    Returns:
        The file name (without directory) of the generated PDF.
    """
    def student_data(name, table):
        # Fetch the student's row.  The name is passed as a bound parameter so
        # quotes in it cannot break or alter the query.
        # NOTE(review): a table name cannot be bound as a parameter, so
        # ``table`` is still interpolated -- it must never come from untrusted
        # input.
        with pymysql.connect(dbhost, dbuser, dbpas, 'exam') as db:
            db.execute(f"SELECT * FROM {table} WHERE name = %s", (name,))
            res = db.fetchone()
        return res

    def set_need_appearances_writer(writer):
        # Best-effort: set /NeedAppearances on the writer's AcroForm entry.
        # Failures are reported on stdout and the writer is returned as is.
        try:
            catalog = writer._root_object
            if "/AcroForm" not in catalog:
                # NOTE(review): this reference targets whichever object is
                # currently last in writer._objects -- confirm that is the
                # intended AcroForm dictionary.
                writer._root_object.update({
                    NameObject("/AcroForm"):
                    IndirectObject(len(writer._objects), 0, writer)
                })

            need_appearances = NameObject("/NeedAppearances")
            writer._root_object["/AcroForm"][need_appearances] = BooleanObject(
                True)
            return writer

        except Exception as e:
            print('set_need_appearances_writer() catch : ', repr(e))
            return writer

    def create_dict(row):  # Check totals
        # Map a result row onto the template's field names.  The row holds the
        # name at index 1, then (obtained, total) pairs for each subject from
        # index 2 onwards, and the overall figures in the last three columns.
        values = {}
        values['name'] = row[1]
        values['class'] = get_class_student(row[1])
        values['exam'] = rev_get_exam(b)
        subjects = ('english', 'science', 'math', 'social')
        for position, subject in enumerate(subjects):
            obtained = row[2 + 2 * position]
            total = row[3 + 2 * position]
            values[f'{subject}_obt'] = obtained
            values[f'{subject}_total'] = total
            # The percentage is kept as the first four characters of its
            # string form; getapi() receives that same string.
            percent = str((int(obtained) / int(total)) * 100)[:4]
            values[f'{subject}_percent'] = percent
            values[f'{subject}_api'] = getapi(percent)
        values['obt_total'] = row[-3]
        values['total_total'] = row[-2]
        values['percentage'] = row[-1]
        values['total_api'] = values['english_api'] + values['science_api'] + \
                              values['math_api'] + values['social_api']
        return values

    data = create_dict(student_data(a, b))
    outfile = f'{cwd}/uploads/{a}.pdf'
    infile = f'{cwd}/templates/report2.pdf'

    # The reader pulls from the template lazily, so the input file has to stay
    # open until the writer has produced the output.
    with open(infile, "rb") as input_stream:
        pdf_reader = PyPDF2.PdfFileReader(input_stream, strict=False)
        if "/AcroForm" in pdf_reader.trailer["/Root"]:
            pdf_reader.trailer["/Root"]["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer = PyPDF2.PdfFileWriter()
        set_need_appearances_writer(pdf_writer)
        if "/AcroForm" in pdf_writer._root_object:
            pdf_writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})

        pdf_writer.addPage(pdf_reader.getPage(0))
        page = pdf_writer.getPage(0)
        pdf_writer.updatePageFormFieldValues(page, data)

        # Lock every field that was filled in.
        for annot_ref in page['/Annots']:
            writer_annot = annot_ref.getObject()
            if not any(writer_annot.get('/T') == field for field in data):
                continue
            existing_flags = writer_annot.get('/Ff')
            # /Ff is a bit field: OR the ReadOnly bit in instead of
            # overwriting, so flags already set on the field are kept.
            if isinstance(existing_flags, NumberObject):
                readonly_flags = existing_flags | 1
            else:
                readonly_flags = 1
            writer_annot.update({
                NameObject("/Ff"): NumberObject(readonly_flags)  # make ReadOnly
            })

        # Open the output only once everything else has succeeded.
        with open(outfile, "wb") as output_stream:
            pdf_writer.write(output_stream)

    return outfile.split('/')[-1]
Exemple #29
0
    def convert_to_pdfa(self):
        """
        Transform the opened PDF file into a PDF/A compliant file

        The writer is modified in place: the file header and the trailer ID are replaced, an sRGB output intent is
        attached to the document catalog, the glyph width arrays of the descendant fonts are rebuilt from the embedded
        font files (only when fonttools is available), the outlines count is set to 1, the Creator/Producer metadata
        are set and ``self.is_pdfa`` is set to True.
        """
        # Set the PDF version to 1.7 (as PDF/A-3 is based on version 1.7) and make it PDF/A compliant.
        # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1

        # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
        # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
        # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
        # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
        self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

        # Add a document ID to the trailer. This is only needed when using encryption with regular PDF, but is required
        # when using PDF/A
        pdf_id = ByteStringObject(md5(self._reader.stream.getvalue()).digest())
        # The first string is based on the content at the time of creating the file, while the second is based on the
        # content of the file when it was last updated. When creating a PDF, both are set to the same value.
        self._ID = ArrayObject((pdf_id, pdf_id))

        # Embed the sRGB ICC profile as a Flate-compressed stream.
        with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_profile:
            icc_profile_file_data = compress(icc_profile.read())

        icc_profile_stream_obj = DecodedStreamObject()
        icc_profile_stream_obj.setData(icc_profile_file_data)
        icc_profile_stream_obj.update({
            NameObject("/Filter"): NameObject("/FlateDecode"),
            NameObject("/N"): NumberObject(3),
            # The stream length is an integer, not a name.
            NameObject("/Length"): NumberObject(len(icc_profile_file_data)),
        })

        icc_profile_obj = self._addObject(icc_profile_stream_obj)

        # Declare the profile as the document's output intent.
        output_intent_dict_obj = DictionaryObject()
        output_intent_dict_obj.update({
            NameObject("/S"): NameObject("/GTS_PDFA1"),
            NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
            NameObject("/DestOutputProfile"): icc_profile_obj,
            NameObject("/Type"): NameObject("/OutputIntent"),
        })

        output_intent_obj = self._addObject(output_intent_dict_obj)
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject([output_intent_obj]),
        })

        pages = self._root_object['/Pages']['/Kids']

        # PDF/A needs the glyphs width array embedded in the pdf to be consistent with the ones from the font file.
        # But it seems like it is not the case when exporting from wkhtmltopdf.
        if TTFont:
            fonts = {}
            # First browse through all the pages of the pdf file, to get a reference to all the fonts used in the PDF.
            # Keying by object number keeps a font shared between pages from being processed twice.
            for page in pages:
                for font in page.getObject()['/Resources']['/Font'].values():
                    for descendant in font.getObject()['/DescendantFonts']:
                        fonts[descendant.idnum] = descendant.getObject()

            # Then for each font, rewrite the width array with the information taken directly from the font file.
            # The new width are calculated such as width = round(1000 * font_glyph_width / font_units_per_em)
            # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
            for font in fonts.values():
                font_file = font['/FontDescriptor']['/FontFile2']
                # The context manager releases the buffer even if the font cannot be parsed.
                with io.BytesIO(decompress(font_file._data)) as stream:
                    ttfont = TTFont(stream)
                    font_upm = ttfont['head'].unitsPerEm
                    glyphs = ttfont.getGlyphSet()._hmtx.metrics
                    # Only the entries whose name starts with 'glyph' are used.
                    glyph_widths = [
                        NumberObject(round(1000.0 * values[0] / font_upm))
                        for key, values in glyphs.items()
                        if key[:5] == 'glyph'
                    ]

                font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(glyph_widths)])
        else:
            _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')

        outlines = self._root_object['/Outlines'].getObject()
        outlines[NameObject('/Count')] = NumberObject(1)

        # Set odoo as producer
        self.addMetadata({
            '/Creator': "Odoo",
            '/Producer': "Odoo",
        })
        self.is_pdfa = True
Exemple #30
0
def make_page_fields_readonly(page):
    """Set the read-only flag on the form-field annotations of *page*.

    Each entry of ``page["/Annots"]`` is resolved, and when its ``/Ff`` value
    is a ``NumberObject`` the lowest bit is OR-ed into it, so the flags that
    are already set are kept.
    """
    # Walk the annotation array by index and resolve each reference.
    for j in range(0, len(page["/Annots"])):
        writer_annot = page["/Annots"][j].getObject()
        existing_flags = writer_annot.get("/Ff")
        # Only numeric flag values are handled by this branch.
        if isinstance(existing_flags, NumberObject):
            writer_annot.update({NameObject("/Ff"): NumberObject(existing_flags | 1)})