Example #1
0
def test_write_metadata():
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    metadata = reader.metadata
    writer.add_metadata(metadata)

    writer.add_metadata({"/Title": "The Crazy Ones"})

    # finally, write data to PyPDF2-output.pdf
    tmp_filename = "dont_commit_writer_added_metadata.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Check if the title was set
    reader = PdfReader(tmp_filename)
    metadata = reader.metadata
    assert metadata.get("/Title") == "The Crazy Ones"

    # Cleanup
    os.remove(tmp_filename)
Example #2
0
def test_writer_clone():
    src = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")

    reader = PdfReader(src)
    writer = PdfWriter()

    writer.clone_document_from_reader(reader)
    assert len(writer.pages) == 4
Example #3
0
def test_pdf_header():
    writer = PdfWriter()
    assert writer.pdf_header == b"%PDF-1.3"

    reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
    writer.add_page(reader.pages[0])
    assert writer.pdf_header == b"%PDF-1.5"

    writer.pdf_header = b"%PDF-1.6"
    assert writer.pdf_header == b"%PDF-1.6"
Example #4
0
def test_encrypt(use_128bit):
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "form.pdf"))
    writer = PdfWriter()

    page = reader.pages[0]
    orig_text = page.extract_text()

    writer.add_page(page)
    writer.encrypt(user_pwd="userpwd",
                   owner_pwd="ownerpwd",
                   use_128bit=use_128bit)

    # write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_encrypted.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    with open(tmp_filename, "rb") as input_stream:
        data = input_stream.read()

    assert b"foo" not in data

    reader = PdfReader(tmp_filename, password="******")
    new_text = reader.pages[0].extract_text()
    assert reader.metadata.get("/Producer") == "PyPDF2"

    assert new_text == orig_text

    # Cleanup
    os.remove(tmp_filename)
Example #5
0
def test_io_streams():
    """This is the example from the docs ("Streaming data")."""

    filepath = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
    with open(filepath, "rb") as fh:
        bytes_stream = BytesIO(fh.read())

    # Read from bytes stream
    reader = PdfReader(bytes_stream)
    assert len(reader.pages) == 4

    # Write to bytes stream
    writer = PdfWriter()
    with BytesIO() as output_stream:
        writer.write(output_stream)
Example #6
0
def test_regression_issue670():
    filepath = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    reader = PdfReader(filepath, strict=False)
    for _ in range(2):
        writer = PdfWriter()
        writer.add_page(reader.pages[0])
        with open("dont_commit_issue670.pdf", "wb") as f_pdf:
            writer.write(f_pdf)
Example #7
0
def test_add_bookmark():
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    writer = PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    bookmark = writer.add_bookmark("A bookmark", 1, None, (255, 0, 15), True,
                                   True, "/Fit", 200, 0, None)
    writer.add_bookmark("Another", 2, bookmark, None, False, False, "/Fit", 0,
                        0, None)

    # write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_bookmark.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Cleanup
    os.remove(tmp_filename)
Example #8
0
def test_fill_form():
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "form.pdf"))
    writer = PdfWriter()

    page = reader.pages[0]

    writer.add_page(page)

    writer.update_page_form_field_values(writer.pages[0],
                                         {"foo": "some filled in text"},
                                         flags=1)

    # write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_filled_pdf.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)
Example #9
0
def test_remove_text(input_path, ignore_byte_string_object):
    pdf_path = os.path.join(RESOURCE_ROOT, input_path)

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    page = reader.pages[0]
    writer.insert_page(page, 0)
    writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)

    # finally, write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_writer_removed_text.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Cleanup
    os.remove(tmp_filename)
Example #10
0
def test_issue301():
    """
    Test with invalid stream length object
    """
    with open(os.path.join(RESOURCE_ROOT, "issue-301.pdf"), "rb") as f:
        reader = PdfReader(f)
        writer = PdfWriter()
        writer.append_pages_from_reader(reader)
        o = BytesIO()
        writer.write(o)
Example #11
0
def test_remove_child_in_tree():
    pdf = os.path.join(RESOURCE_ROOT, "form.pdf")

    tree = TreeObject()
    reader = PdfReader(pdf)
    writer = PdfWriter()
    writer.add_page(reader.pages[0])
    writer.add_bookmark("foo", pagenum=0)
    obj = writer._objects[-1]
    tree.add_child(obj, writer)
    tree.remove_child(obj)
    tree.add_child(obj, writer)
    tree.empty_tree()
Example #12
0
def test_remove_images(input_path, ignore_byte_string_object):
    pdf_path = os.path.join(RESOURCE_ROOT, input_path)

    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    page = reader.pages[0]
    writer.insert_page(page, 0)
    writer.remove_images(ignore_byte_string_object=ignore_byte_string_object)

    # finally, write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_writer_removed_image.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    with open(tmp_filename, "rb") as input_stream:
        reader = PdfReader(input_stream)
        if input_path == "side-by-side-subfig.pdf":
            extracted_text = reader.pages[0].extract_text()
            assert "Lorem ipsum dolor sit amet" in extracted_text

    # Cleanup
    os.remove(tmp_filename)
Example #13
0
def test_overlay(base_path, overlay_path):
    if base_path.startswith("http"):
        base_path = BytesIO(get_pdf_from_url(base_path,
                                             name="tika-935981.pdf"))
    else:
        base_path = os.path.join(PROJECT_ROOT, base_path)
    reader = PdfReader(base_path)
    writer = PdfWriter()

    reader_overlay = PdfReader(os.path.join(PROJECT_ROOT, overlay_path))
    overlay = reader_overlay.pages[0]

    for page in reader.pages:
        page.merge_page(overlay)
        writer.add_page(page)
    with open("dont_commit_overlay.pdf", "wb") as fp:
        writer.write(fp)

    # Cleanup
    os.remove("dont_commit_overlay.pdf")
Example #14
0
def test_basic_features():
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    reader = PdfReader(pdf_path)
    writer = PdfWriter()

    assert len(reader.pages) == 1

    # add page 1 from input1 to output document, unchanged
    writer.add_page(reader.pages[0])

    # add page 2 from input1, but rotated clockwise 90 degrees
    writer.add_page(reader.pages[0].rotate(90))

    # add page 3 from input1, but first add a watermark from another PDF:
    page3 = reader.pages[0]
    watermark_pdf = pdf_path
    watermark = PdfReader(watermark_pdf)
    page3.merge_page(watermark.pages[0])
    writer.add_page(page3)

    # add page 4 from input1, but crop it to half size:
    page4 = reader.pages[0]
    page4.mediabox.upper_right = (
        page4.mediabox.right / 2,
        page4.mediabox.top / 2,
    )
    writer.add_page(page4)

    # add some Javascript to launch the print window on opening this PDF.
    # the password dialog may prevent the print dialog from being shown,
    # comment the the encription lines, if that's the case, to try this out
    writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

    # encrypt your new PDF and add a password
    password = "******"
    writer.encrypt(password)

    # finally, write "output" to PyPDF2-output.pdf
    tmp_path = "PyPDF2-output.pdf"
    with open(tmp_path, "wb") as output_stream:
        writer.write(output_stream)

    # cleanup
    os.remove(tmp_path)
Example #15
0
def test_add_link():
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    writer = PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    from PyPDF2.generic import RectangleObject

    writer.add_link(
        1,
        2,
        RectangleObject([0, 0, 100, 100]),
        border=[1, 2, 3, [4]],
        fit="/Fit",
    )
    writer.add_link(2, 3, RectangleObject([20, 30, 50, 80]), [1, 2, 3],
                    "/FitH", None)
    writer.add_link(
        3,
        0,
        "[ 200 300 250 350 ]",
        [0, 0, 0],
        "/XYZ",
        0,
        0,
        2,
    )
    writer.add_link(
        3,
        0,
        [100, 200, 150, 250],
        border=[0, 0, 0],
    )

    # write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_link.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Cleanup
    os.remove(tmp_filename)
Example #16
0
def test_add_uri():
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    writer = PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    from PyPDF2.generic import RectangleObject

    writer.add_uri(
        1,
        "http://www.example.com",
        RectangleObject([0, 0, 100, 100]),
        border=[1, 2, 3, [4]],
    )
    writer.add_uri(
        2,
        "https://pypdf2.readthedocs.io/en/latest/",
        RectangleObject([20, 30, 50, 80]),
        border=[1, 2, 3],
    )
    writer.add_uri(
        3,
        "https://pypdf2.readthedocs.io/en/latest/user/adding-pdf-annotations.html",
        "[ 200 300 250 350 ]",
        border=[0, 0, 0],
    )
    writer.add_uri(
        3,
        "https://pypdf2.readthedocs.io/en/latest/user/adding-pdf-annotations.html",
        [100, 200, 150, 250],
        border=[0, 0, 0],
    )

    # write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_uri.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Cleanup
    os.remove(tmp_filename)
Example #17
0
def test_add_named_destination():
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    writer = PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    from PyPDF2.generic import NameObject

    writer.add_named_destination(NameObject("A named dest"), 2)
    writer.add_named_destination(NameObject("A named dest2"), 2)

    from PyPDF2.generic import IndirectObject

    assert writer.get_named_dest_root() == [
        "A named dest",
        IndirectObject(7, 0, writer),
        "A named dest2",
        IndirectObject(10, 0, writer),
    ]

    # write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_named_destination.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Cleanup
    os.remove(tmp_filename)
Example #18
0
def pdf_file_writer():
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
    writer = PdfWriter()
    writer.append_pages_from_reader(reader)
    return writer
Example #19
0
def test_writer_operations():
    """
    This test just checks if the operation throws an exception.

    This should be done way more thoroughly: It should be checked if the
    output is as expected.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    pdf_outline_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")

    reader = PdfReader(pdf_path)
    reader_outline = PdfReader(pdf_outline_path)

    writer = PdfWriter()
    page = reader.pages[0]
    with pytest.raises(PageSizeNotDefinedError) as exc:
        writer.add_blank_page()
    assert exc.value.args == ()
    writer.insert_page(page, 1)
    writer.insert_page(reader_outline.pages[0], 0)
    writer.add_bookmark_destination(page)
    writer.remove_links()
    writer.add_bookmark_destination(page)
    bm = writer.add_bookmark("A bookmark", 0, None, (255, 0, 15), True, True,
                             "/FitBV", 10)
    writer.add_bookmark("The XYZ fit", 0, bm, (255, 0, 15), True, True, "/XYZ",
                        10, 20, 3)
    writer.add_bookmark("The FitH fit", 0, bm, (255, 0, 15), True, True,
                        "/FitH", 10)
    writer.add_bookmark("The FitV fit", 0, bm, (255, 0, 15), True, True,
                        "/FitV", 10)
    writer.add_bookmark("The FitR fit", 0, bm, (255, 0, 15), True, True,
                        "/FitR", 10, 20, 30, 40)
    writer.add_bookmark("The FitB fit", 0, bm, (255, 0, 15), True, True,
                        "/FitB")
    writer.add_bookmark("The FitBH fit", 0, bm, (255, 0, 15), True, True,
                        "/FitBH", 10)
    writer.add_bookmark("The FitBV fit", 0, bm, (255, 0, 15), True, True,
                        "/FitBV", 10)
    writer.add_blank_page()
    writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100]))
    writer.add_link(2, 1, RectangleObject([0, 0, 100, 100]))
    assert writer._get_page_layout() is None
    writer._set_page_layout("/SinglePage")
    assert writer._get_page_layout() == "/SinglePage"
    assert writer._get_page_mode() is None
    writer.set_page_mode("/UseNone")
    assert writer._get_page_mode() == "/UseNone"
    writer.insert_blank_page(width=100, height=100)
    writer.insert_blank_page()  # without parameters

    # TODO: This gives "KeyError: '/Contents'" - is that a bug?
    # writer.removeImages()

    writer.add_metadata({"author": "Martin Thoma"})

    writer.add_attachment("foobar.gif", b"foobarcontent")

    # finally, write "output" to PyPDF2-output.pdf
    tmp_path = "dont_commit_writer.pdf"
    with open(tmp_path, "wb") as output_stream:
        writer.write(output_stream)

    # cleanup
    os.remove(tmp_path)
Example #20
0
def render_to_pdf(html, filename, xml_filename, env={}):
    "Render html file to pdf using the given filename and environment. Embed xml data if available"
    debug = settings.DEBUG
    debug = False
    if debug:
        return HttpResponse(html)

    fd_html, filename_html = tempfile.mkstemp()
    fd_pdf, filename_pdf = tempfile.mkstemp(suffix=".pdf")
    _, filename_pdf2 = tempfile.mkstemp(suffix=".pdf")
    os.close(fd_pdf)
    try:
        with open(fd_html, 'wb') as f:
            f.write(html.encode('utf8'))
        path = os.path.join(os.path.dirname(__file__), '..', 'webkit',
                            'webkit2pdf')

        try:
            if not debug:
                env['DISPLAY'] = ':1'
                # Fake empty SSL Confif file to be able to run phantomjs in Buster
                env['OPENSSL_CONF'] = os.path.abspath(
                    os.path.join(os.path.dirname(__file__), '..', 'webkit',
                                 'openssl.cnf'))
        except KeyError:
            pass

        env.update(dict(os.environ))  # keep OS env vars, such as PATH

        cmd = [
            path, "-f", filename_html, "-o", filename_pdf, "--mediaroot",
            settings.MEDIA_ROOT, "--staticroot", settings.STATIC_ROOT,
            "--scriptname", settings.FORCE_SCRIPT_NAME or ''
        ]
        proc = subprocess.Popen(cmd, env=env)

        while True:
            proc.poll()
            if proc.returncode is not None:
                break

    except Exception as error:
        print(error)
    finally:
        os.remove(filename_html)

    if xml_filename:
        with open(filename_pdf, 'rb') as pdf:
            reader = PdfReader(pdf, strict=False)
            writer = PdfWriter()
            writer.appendPagesFromReader(reader)
            with open(xml_filename, "rb") as xml:
                writer.addAttachment("certificado.xml", xml.read())
                with open(filename_pdf2, "wb") as out:
                    writer.write(out)
                    out.close()
                pdf.close()

        with open(filename_pdf2, 'rb') as pdf:
            response = HttpResponse(pdf.read(), content_type='application/pdf')
            response[
                'Content-Disposition'] = 'attachment;filename=%s' % filename

            pdf.close()

            os.remove(filename_pdf)
            os.remove(filename_pdf2)
            return response

    else:
        with open(filename_pdf, 'rb') as pdf:
            response = HttpResponse(pdf.read(), content_type='application/pdf')
            response[
                'Content-Disposition'] = 'attachment;filename=%s' % filename

            pdf.close()

            os.remove(filename_pdf)
            os.remove(filename_pdf2)
            return response
Example #21
0
def test_remove_text_all_operators(ignore_byte_string_object):
    stream = (b"BT "
              b"/F0 36 Tf "
              b"50 706 Td "
              b"36 TL "
              b"(The Tj operator) Tj "
              b'1 2 (The double quote operator) " '
              b"(The single quote operator) ' "
              b"ET")
    pdf_data = (
        b"%%PDF-1.7\n"
        b"1 0 obj << /Count 1 /Kids [5 0 R] /Type /Pages >> endobj\n"
        b"2 0 obj << >> endobj\n"
        b"3 0 obj << >> endobj\n"
        b"4 0 obj << /Length %d >>\n"
        b"stream\n" + (b"%s\n" % stream) + b"endstream\n"
        b"endobj\n"
        b"5 0 obj << /Contents 4 0 R /CropBox [0.0 0.0 2550.0 3508.0]\n"
        b" /MediaBox [0.0 0.0 2550.0 3508.0] /Parent 1 0 R"
        b" /Resources << /Font << >> >>"
        b" /Rotate 0 /Type /Page >> endobj\n"
        b"6 0 obj << /Pages 1 0 R /Type /Catalog >> endobj\n"
        b"xref 1 6\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"%010d 00000 n\n"
        b"trailer << /Root 6 0 R /Size 6 >>\n"
        b"startxref\n%d\n"
        b"%%%%EOF")
    startx_correction = -1
    pdf_data = pdf_data % (
        len(stream),
        pdf_data.find(b"1 0 obj") + startx_correction,
        pdf_data.find(b"2 0 obj") + startx_correction,
        pdf_data.find(b"3 0 obj") + startx_correction,
        pdf_data.find(b"4 0 obj") + startx_correction,
        pdf_data.find(b"5 0 obj") + startx_correction,
        pdf_data.find(b"6 0 obj") + startx_correction,
        # startx_correction should be -1 due to double % at the beginning indiducing an error on startxref computation
        pdf_data.find(b"xref"),
    )
    print(pdf_data.decode())
    pdf_stream = BytesIO(pdf_data)

    reader = PdfReader(pdf_stream, strict=False)
    writer = PdfWriter()

    page = reader.pages[0]
    writer.insert_page(page, 0)
    writer.remove_text(ignore_byte_string_object=ignore_byte_string_object)

    # finally, write "output" to PyPDF2-output.pdf
    tmp_filename = "dont_commit_writer_removed_text.pdf"
    with open(tmp_filename, "wb") as output_stream:
        writer.write(output_stream)

    # Cleanup
    os.remove(tmp_filename)