Ejemplo n.º 1
0
def main():
    """Split each page of a PDF into a left-half page and a right-half page.

    For every page of the hard-coded input document two pages are emitted:
    the same page content with its media box restricted to the left half,
    followed by a shallow copy restricted to the right half. The trim
    values below shave margins off each half. The result is written to
    ``pdf_to_text_output.pdf`` in the current working directory.

    The rectangles are built from the constants 306, 612 and 792, i.e. they
    assume 612 x 792 point pages; the measured size of the first page is
    only printed, not used.
    """
    output = PdfFileWriter()
    source_path = "/Users/odin/Downloads/What-is-Life-By-Erwin-Schrodinger.pdf"

    # The input stream has to stay open until the writer has serialised the
    # pages, so all work happens inside this ``with`` block. The context
    # manager guarantees the handle is released even if something raises.
    with open(source_path, "rb") as input_stream:
        input1 = PdfFileReader(input_stream)
        print("document1.pdf has %d pages." % input1.getNumPages())
        page1 = input1.getPage(0)
        print(f'raw size of pdf pages: {page1.mediaBox}')

        width_trim_1 = 75   # trimmed from the outer edge of each half
        width_trim_2 = 3    # trimmed on either side of the centre line
        top_trim = 52
        bottom_trim = 60
        first_half_split = RectangleObject(
            [width_trim_1, bottom_trim, 306 - width_trim_2, 792 - top_trim])
        second_half_split = RectangleObject(
            [306 + width_trim_2, bottom_trim, 612 - width_trim_1,
             792 - top_trim])

        for page_num in range(input1.getNumPages()):
            p = input1.getPage(page_num)
            # Copy before touching the media box so each half gets its own
            # page dictionary.
            p_copy = copy(p)
            p.mediaBox = first_half_split
            p_copy.mediaBox = second_half_split
            output.addPage(p)
            output.addPage(p_copy)

        with open("pdf_to_text_output.pdf", "wb") as output_stream:
            output.write(output_stream)
Ejemplo n.º 2
0
def test_writer_operations():
    """
    Smoke test for a sequence of PdfFileWriter operations.

    This test just checks if the operation throws an exception.

    This should be done way more thoroughly: It should be checked if the
    output is as expected.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "crazyones.pdf")
    pdf_outline_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")

    reader = PdfFileReader(pdf_path)
    reader_outline = PdfFileReader(pdf_outline_path)

    writer = PdfFileWriter()
    page = reader.pages[0]
    # The writer holds no pages yet and no size is passed, so the call is
    # expected to raise; the exception must carry no arguments.
    with pytest.raises(PageSizeNotDefinedError) as exc:
        writer.addBlankPage()
    assert exc.value.args == ()
    # NOTE: the calls below are order-dependent - later page indices refer
    # to pages inserted by earlier calls.
    writer.insertPage(page, 1)
    writer.removeText()
    writer.insertPage(reader_outline.pages[0], 0)
    writer.addBookmarkDestination(page)
    writer.addBookmark("A bookmark", 0)
    # Disabled: named-destination call (still written against the old
    # variable name ``output``).
    # output.addNamedDestination("A named destination", 1)
    writer.removeLinks()
    # Disabled: assertion belonging to the named-destination call above.
    # assert output.getNamedDestRoot() == ['A named destination', IndirectObject(9, 0, output)]
    writer.addBlankPage()
    writer.addURI(2, "https://example.com", RectangleObject([0, 0, 100, 100]))
    writer.addLink(2, 1, RectangleObject([0, 0, 100, 100]))
    # Layout and mode start out unset and must read back what was stored.
    assert writer.getPageLayout() is None
    writer.setPageLayout("SinglePage")
    assert writer.getPageLayout() == "SinglePage"
    assert writer.getPageMode() is None
    writer.setPageMode("UseNone")
    assert writer.getPageMode() == "UseNone"
    writer.insertBlankPage(width=100, height=100)
    writer.insertBlankPage()  # without parameters

    # This gives "KeyError: '/Contents'" - is that a bug?
    # output.removeImages()

    writer.addMetadata({"author": "Martin Thoma"})

    writer.addAttachment("foobar.gif", b"foobarcontent")

    # finally, write the document to a scratch file in the working directory
    tmp_path = "dont_commit_writer.pdf"
    with open(tmp_path, "wb") as output_stream:
        writer.write(output_stream)

    # cleanup
    os.remove(tmp_path)
Ejemplo n.º 3
0
 def process_page(self, page_idx, bbox_func, *args, **kwargs):
     """Process a single page and add it to the writer.

     The page is exported to a temporary file via ``self.export_page``,
     ``bbox_func`` is called on that file to obtain a bounding box, the box
     is applied as the crop box of the page read from ``self.reader``, and
     the page is appended to ``self.writer``.

     Args:
         page_idx: Index of the page, passed to both ``self.export_page``
             and ``self.reader.getPage``.
         bbox_func: Callable invoked as
             ``bbox_func(tmpfname, *args, **kwargs)``; its result is handed
             to ``RectangleObject``.
         *args: Extra positional arguments forwarded to ``bbox_func``.
         **kwargs: Extra keyword arguments forwarded to ``bbox_func``.

     Returns:
         Always ``0``.
     """
     tmpfname = self.export_page(page_idx)
     try:
         bbox = bbox_func(tmpfname, *args, **kwargs)
         thepage = self.reader.getPage(page_idx)
         thepage.cropBox = RectangleObject(bbox)
         self.writer.addPage(thepage)
     finally:
         # Remove the temporary export even when one of the steps above
         # raises; otherwise every failed page would leave a file behind.
         os.unlink(tmpfname)
     return 0
Ejemplo n.º 4
0
def transformAnnot(p, rot, ratio, tx, ty):
    """Rescale and shift the ``/Rect`` of every annotation on page ``p``.

    Takes the page and the same arguments as
    mergeRotatedScaledTranslatedPage. Each rectangle is read through its
    upper-left and lower-right corners; when ``rot`` is 90 the x and y of
    both corners are swapped. Every coordinate is then multiplied by
    ``ratio`` and offset by ``tx`` (x values) or ``ty`` (y values), and the
    annotation's ``/Rect`` entry is replaced. Pages without ``/Annots`` are
    left untouched.
    """
    if '/Annots' not in p:
        return

    for ref in p['/Annots']:
        annot = ref.getObject()
        rect = RectangleObject(annot['/Rect'])
        ax, ay = rect.upperLeft
        bx, by = rect.lowerRight
        if rot == 90:
            # Swap the axes of both corners.
            ax, ay = ay, ax
            bx, by = by, bx
        scaled = [
            ax * ratio + tx,
            ay * ratio + ty,
            bx * ratio + tx,
            by * ratio + ty,
        ]
        annot.update({NameObject('/Rect'): RectangleObject(scaled)})
Ejemplo n.º 5
0
def test_add_link():
    """Smoke test: add internal links in several argument styles and write.

    The rectangle is supplied as a ``RectangleObject``, as a string and as a
    plain list; nothing about the written file is inspected.
    """
    source = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    writer = PdfWriter()

    for source_page in source.pages:
        writer.add_page(source_page)

    from PyPDF2.generic import RectangleObject

    # RectangleObject rectangle, border and fit passed by keyword.
    big_rect = RectangleObject([0, 0, 100, 100])
    writer.add_link(1, 2, big_rect, border=[1, 2, 3, [4]], fit="/Fit")

    # RectangleObject rectangle, everything positional.
    small_rect = RectangleObject([20, 30, 50, 80])
    writer.add_link(2, 3, small_rect, [1, 2, 3], "/FitH", None)

    # Rectangle given as a string, with three positional fit arguments.
    writer.add_link(3, 0, "[ 200 300 250 350 ]", [0, 0, 0], "/XYZ", 0, 0, 2)

    # Rectangle given as a plain list.
    writer.add_link(3, 0, [100, 200, 150, 250], border=[0, 0, 0])

    # Write to a scratch file in the working directory, then delete it.
    scratch_path = "dont_commit_link.pdf"
    with open(scratch_path, "wb") as sink:
        writer.write(sink)

    os.remove(scratch_path)
Ejemplo n.º 6
0
def test_add_uri():
    """Smoke test: add URI annotations in several argument styles and write.

    The rectangle is supplied as a ``RectangleObject``, as a string and as a
    plain list; nothing about the written file is inspected.
    """
    source = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))
    writer = PdfWriter()

    for source_page in source.pages:
        writer.add_page(source_page)

    from PyPDF2.generic import RectangleObject

    # RectangleObject rectangles.
    big_rect = RectangleObject([0, 0, 100, 100])
    writer.add_uri(1, "http://www.example.com", big_rect, border=[1, 2, 3, [4]])

    small_rect = RectangleObject([20, 30, 50, 80])
    writer.add_uri(
        2,
        "https://pypdf2.readthedocs.io/en/latest/",
        small_rect,
        border=[1, 2, 3],
    )

    # Same target twice on page 3: string rectangle, then list rectangle.
    annotations_doc = (
        "https://pypdf2.readthedocs.io/en/latest/user/adding-pdf-annotations.html"
    )
    writer.add_uri(3, annotations_doc, "[ 200 300 250 350 ]", border=[0, 0, 0])
    writer.add_uri(3, annotations_doc, [100, 200, 150, 250], border=[0, 0, 0])

    # Write to a scratch file in the working directory, then delete it.
    scratch_path = "dont_commit_uri.pdf"
    with open(scratch_path, "wb") as sink:
        writer.write(sink)

    os.remove(scratch_path)
Ejemplo n.º 7
0
def test_page_properties():
    """All five page boxes report the full page; bleedbox can be reassigned."""
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
    page = reader.pages[0]

    full_page = (0, 0, 612, 792)
    for box_name in ("mediabox", "cropbox", "bleedbox", "trimbox", "artbox"):
        assert getattr(page, box_name) == RectangleObject(full_page)

    # Assign a new bleed box and check it reads back equal.
    new_bleed = (0, 1, 100, 101)
    page.bleedbox = RectangleObject(new_bleed)
    assert page.bleedbox == RectangleObject(new_bleed)
Ejemplo n.º 8
0
def add_comment(output, page, text, rectangle):
    """Attach a white ``/FreeText`` annotation containing ``text`` to ``page``.

    Args:
        output: Writer object; the annotation dictionary is registered with
            it through ``output._addObject``.
        page: Page dictionary that receives the annotation reference.
        text: Annotation contents.
        rectangle: Value handed to ``RectangleObject`` for the ``/Rect``
            entry.

    The annotation reference is appended to the page's ``/Annots`` array.
    If the page has no ``/Annots`` entry yet, one is created instead of
    failing with ``KeyError``.
    """
    annotation = DictionaryObject({
        NameObject('/DA'): TextStringObject(' /Helv 10 Tf'),
        NameObject('/Subtype'): NameObject('/FreeText'),
        NameObject('/Rect'): RectangleObject(rectangle),
        NameObject('/Type'): NameObject('/Annot'),
        NameObject('/Contents'): TextStringObject(text),
        # /C is the annotation colour; (1, 1, 1) is white in RGB.
        NameObject('/C'): ArrayObject(
            [FloatObject(1), FloatObject(1), FloatObject(1)]),
    })
    obj = output._addObject(annotation)

    if '/Annots' in page:
        page['/Annots'].append(obj)
    else:
        # Pages without existing annotations carry no /Annots key at all.
        page[NameObject('/Annots')] = ArrayObject([obj])
Ejemplo n.º 9
0
def test_RectangleObject():
    """Corner properties derive from the four numbers and can be reassigned."""
    rect = RectangleObject((1, 2, 3, 4))

    # Corners as derived from the constructor arguments.
    initial_corners = (
        ("lower_left", (1, 2)),
        ("lower_right", (3, 2)),
        ("upper_left", (1, 4)),
        ("upper_right", (3, 4)),
    )
    for corner, expected in initial_corners:
        assert getattr(rect, corner) == expected

    # Assign each corner in turn and check it reads back immediately.
    reassigned_corners = (
        ("lower_left", (5, 6)),
        ("lower_right", (7, 8)),
        ("upper_left", (9, 11)),
        ("upper_right", (13, 17)),
    )
    for corner, value in reassigned_corners:
        setattr(rect, corner, value)
        assert getattr(rect, corner) == value
Ejemplo n.º 10
0
def test_writer_operations():
    """
    Smoke test for a sequence of PdfWriter operations.

    Only checks that the operations run without raising (plus a few
    getter/setter round trips). The written output itself is not verified;
    that should be done far more thoroughly.
    """
    reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf"))
    reader_outline = PdfReader(
        os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf"))

    writer = PdfWriter()
    page = reader.pages[0]

    # No pages and no explicit size yet: expected to raise without args.
    with pytest.raises(PageSizeNotDefinedError) as exc:
        writer.add_blank_page()
    assert exc.value.args == ()

    writer.insert_page(page, 1)
    writer.insert_page(reader_outline.pages[0], 0)
    writer.add_bookmark_destination(page)
    writer.remove_links()
    writer.add_bookmark_destination(page)

    # One parent bookmark, then one child per fit type.
    color = (255, 0, 15)
    bm = writer.add_bookmark(
        "A bookmark", 0, None, color, True, True, "/FitBV", 10)
    child_fits = (
        ("/XYZ", 10, 20, 3),
        ("/FitH", 10),
        ("/FitV", 10),
        ("/FitR", 10, 20, 30, 40),
        ("/FitB",),
        ("/FitBH", 10),
        ("/FitBV", 10),
    )
    for fit, *fit_args in child_fits:
        title = "The %s fit" % fit[1:]
        writer.add_bookmark(title, 0, bm, color, True, True, fit, *fit_args)

    writer.add_blank_page()
    writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100]))
    writer.add_link(2, 1, RectangleObject([0, 0, 100, 100]))

    # Layout and mode start unset and read back what was stored.
    assert writer._get_page_layout() is None
    writer._set_page_layout("/SinglePage")
    assert writer._get_page_layout() == "/SinglePage"
    assert writer._get_page_mode() is None
    writer.set_page_mode("/UseNone")
    assert writer._get_page_mode() == "/UseNone"

    writer.insert_blank_page(width=100, height=100)
    writer.insert_blank_page()  # without parameters

    # TODO: This gives "KeyError: '/Contents'" - is that a bug?
    # writer.removeImages()

    writer.add_metadata({"author": "Martin Thoma"})
    writer.add_attachment("foobar.gif", b"foobarcontent")

    # Write to a scratch file in the working directory, then delete it.
    scratch_path = "dont_commit_writer.pdf"
    with open(scratch_path, "wb") as sink:
        writer.write(sink)

    os.remove(scratch_path)
Ejemplo n.º 11
0
def validate_mediabox(mediabox: RectangleObject, options):
    """Check that ``mediabox`` has the configured page size in either orientation.

    The expected size is ``options["page-size"]["width"]`` by
    ``options["page-size"]["height"]``, each multiplied by ``inch``. The box
    matches when its width and height equal that pair exactly, in the given
    order or swapped.
    """
    page_size = options["page-size"]
    expected_width = inch * page_size["width"]
    expected_height = inch * page_size["height"]

    actual_width = mediabox.getWidth()
    actual_height = mediabox.getHeight()

    if actual_width == expected_width and actual_height == expected_height:
        return True
    # Same dimensions with the page turned by a quarter.
    return actual_width == expected_height and actual_height == expected_width
Ejemplo n.º 12
0
def annotate(fp_in, annotations):
    """Add ``/Text`` annotations to a PDF and return the serialised result.

    Args:
        fp_in: Source PDF, handed straight to ``PdfFileReader``.
        annotations: Iterable of dicts. ``'x'``, ``'y'`` and ``'text'`` are
            required. Optional keys: ``'page'`` (index, default 0),
            ``'color'`` (``"#RRGGBB"`` string or an integer that is split
            into three bytes), ``'author'`` and ``'modified'`` (passed to
            ``time.gmtime``).

    Returns:
        The contents of the output buffer after the writer has been written
        into it.

    Annotations naming a page that does not exist are skipped with a message
    on stderr. A malformed colour is reported on stderr and the annotation
    is then added without a ``/C`` entry, as is an annotation with no colour
    at all.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    reader = PdfFileReader(fp_in)
    pdf = PdfFileWriter()
    for page in reader.pages:
        pdf.addPage(page)

    for annotation in annotations:
        page = annotation.get('page', 0)
        try:
            pdfpage = pdf.getPage(page)
        except IndexError:
            sys.stderr.write(
                'Page %d not found in pdf, not adding annotations %r\n' %
                (page, annotation))
            continue

        size = pdfpage.mediaBox
        angle = int(pdfpage.get('/Rotate', 0))
        x = annotation['x']
        y = annotation['y']
        if angle == 0:
            x = float(x)
            # The incoming y is subtracted from the top edge of the media
            # box. float() keeps the arithmetic valid when the box value is
            # a Decimal-based number, which cannot be mixed with floats.
            y = float(size[3]) - float(y) - 20
        elif angle == 90:
            x, y = float(y) - 2, float(x) - 15
        else:
            x = float(x)
            y = float(y)
            sys.stderr.write(
                'Page rotated by %d degrees not implemented yet\n' % (angle))

        color = annotation.get('color', None)
        if isinstance(color, string_types):
            if color[:1] != '#':
                sys.stderr.write('Unsupported color format: %s\n' % (color))
                color = None
            else:
                # Assume HTML color with format "#RRGGBB".
                try:
                    color = int(color[1:], 16)
                except ValueError as e:
                    sys.stderr.write(
                        'Unsupported color format: %s (%s)\n' % (color, e))
                    color = None

        pages = pdf.getObject(pdf._pages)
        pageref = pages["/Kids"][page]

        anno = DictionaryObject()
        anno.update({
            NameObject('/Type'):
            NameObject('/Annot'),
            NameObject('/Subtype'):
            NameObject('/Text'),
            NameObject('/P'):
            pageref,
            NameObject('/Rect'):
            RectangleObject([x, y, x + 18, y + 20]),
            NameObject('/Contents'):
            TextStringObject(annotation['text']),
            NameObject('/Open'):
            BooleanObject(True),
        })
        if color is not None:
            # Only emit /C when there is a colour; iterating over None here
            # used to abort the whole run.
            r, g, b = color >> 16, (color >> 8) & 0xff, color & 0xff
            rgb = (r * BYTE_TO_COLOR, g * BYTE_TO_COLOR, b * BYTE_TO_COLOR)
            anno[NameObject('/C')] = ArrayObject(
                [FloatObject(channel) for channel in rgb])

        author = annotation.get('author', None)
        if author:
            anno[NameObject('/T')] = TextStringObject(author)
        modified = annotation.get('modified', None)
        if modified:
            modified = time.strftime('%Y%m%d%H%M%SZ', time.gmtime(modified))
            anno[NameObject('/M')] = TextStringObject(modified)

        annoRef = pdf._addObject(anno)
        annots = pdfpage.get('/Annots', None)
        if annots is None:
            annots = pdfpage[NameObject('/Annots')] = ArrayObject([annoRef])
        else:
            annots.append(annoRef)

    # NOTE(review): the writer presumably emits binary data, so on Python 3
    # ``StringIO`` must be bound to a bytes buffer (io.BytesIO) - confirm
    # against the file's imports.
    fp_out = StringIO()
    pdf.write(fp_out)
    return fp_out.getvalue()
Ejemplo n.º 13
0
def applyTransform():
    r = RectangleObject([0, 0])
    x, y = r.lowerLeft