Python PdfFileReader Examples, pypdf.pdf.PdfFileReader Python Examples

Example #1

0

Show file

    def testJpegImage(self):
        """
        Loads ``jpeg.pdf``, pulls the data of the ``/Im4`` image XObject out of
        its first page and checks that the hexadecimal dump of that data
        equals the contents of the reference file ``jpeg.txt``.
        """
        with open(join(TEST_DATA_ROOT, 'jpeg.pdf'), 'rb') as pdfStream:
            reader = PdfFileReader(pdfStream)

            # Reference hex dump the extracted image is compared against
            expectedPath = join(self.localDataRoot, 'jpeg.txt')
            with open(expectedPath, 'r') as expectedFile:
                expectedHex = expectedFile.read()

            # Walk page 1 -> /Resources -> /XObject -> /Im4
            resources = reader.getPage(0)['/Resources']
            images = resources['/XObject'].getObject()
            extractedHex = binascii.hexlify(images['/Im4'].getData()).decode()

            failureMsg = (
                'PDF extracted image differs from expected value.'
                '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n' %
                (expectedHex, extractedHex))
            self.assertEqual(extractedHex, expectedHex, msg=failureMsg)

            reader.close()

Example #2

0

Show file

File: test_generic.py Project: zcatbear/PyPDF4

    def testObjectIds(self):
        """
        Checks ``ObjectStream.objectIds`` against the object IDs known to be
        held by the object streams of a couple of sample files.
        """
        # Files we know to have Object Streams within, one case per row:
        # (filename, id, generation number, IDs expected in the stream)
        cases = (
            ("crazyones.pdf", 9, 0,
             (8, 3, 10, 2, 1, 11, 13, 15, 4, 19, 5, 20, 6, 21, 17)),
            ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", 645, 0,
             (644, 642, 646, 647, 648, 122, 119, 120, 121, 124, 179, 232,
              327, 467, 478, 519, 568, 573, 580, 586, 592, 598, 603, 611,
              616, 623, 629, 634)),
        )

        for filename, idnum, generation, expectedIds in cases:
            reader = PdfFileReader(join(TESTS_DATA_ROOT, filename))
            reference = IndirectObject(idnum, generation, reader)
            objStm = reader.getObject(reference)

            reader.close()

            self.assertIsInstance(objStm, ObjectStream)
            self.assertTupleEqual(
                tuple(expectedIds), tuple(objStm.objectIds))

Example #3

0

Show file

    def testDel(self):
        """
        Tests the ``__del__()`` method of ``PdfFileReader`` and
        ``PdfFileWriter`` ensuring that no exceptions are raised.

        ``__del__()`` is invoked explicitly on one instance of each class; any
        exception escaping from it is reported as a test failure.
        """
        r = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
        w = PdfFileWriter()

        # This may generate some collateral warnings in stderr when del r
        # is performed by the GC
        for obj, cls in ((r, PdfFileReader), (w, PdfFileWriter)):
            try:
                obj.__del__()
            except Exception as e:
                # self.fail() states the intent directly, instead of
                # asserting that a constant False is true
                self.fail(
                    "Exception '%s' was raised in %s.__del__()" %
                    (e, cls.__name__))

Example #4

0

Show file

    def testFileLoad(self):
        """
        Opens ``crazyones.pdf``, extracts the text of its first page and
        checks it against the bytes stored in the reference file
        ``crazyones.txt``. Newlines are stripped from the extracted text, which
        is then UTF-8 encoded before the comparison.
        """
        with open(join(TEST_DATA_ROOT, 'crazyones.pdf'), 'rb') as pdfStream:
            reader = PdfFileReader(pdfStream)
            firstPage = reader.getPage(0)

            # Known-good text, read as raw bytes
            expectedPath = join(self.localDataRoot, 'crazyones.txt')
            with open(expectedPath, 'rb') as expectedFile:
                expected = expectedFile.read()

            extracted = firstPage.extractText()
            extracted = extracted.replace('\n', '').encode('utf-8')

            failureMsg = (
                'PDF extracted text differs from expected value.'
                '\n\nExpected:\n\n%r\n\nExtracted:\n\n%r\n\n' %
                (expected, extracted))
            self.assertEqual(expected, extracted, msg=failureMsg)

            reader.close()

Example #5

0

Show file

    def testAttachFiles(self):
        """
        Tests ``PdfFileWriter.attachFiles()`` for attaching multiple files.

        Since the Names array in the EmbeddedFiles dictionary contains both the
        name (string) and indirect object (dictionary) for each file, we have
        to check for two entries per attached file.
        """

        numAttachments = 3
        # mkstemp() returns an already-open OS-level file descriptor along
        # with the path. Only the path is needed here, so the descriptor is
        # closed right away rather than leaked (an open handle would also
        # prevent os.remove() from succeeding on Windows).
        fd, testfile = tempfile.mkstemp()
        os.close(fd)

        try:
            # Make PDF with attachment
            with PdfFileReader(join(TEST_DATA_ROOT, 'jpeg.pdf')) as reader:
                with PdfFileWriter(testfile) as writer:
                    writer.appendPagesFromReader(reader)

                    writer.attachFiles(
                        [join(TEST_DATA_ROOT, 'attachment_small.png')] *
                        numAttachments)
                    writer.write()

            # Check for attachment entries
            with PdfFileReader(testfile) as pdf:
                pdf.numPages  # For caching _cachedObjects data
                for _, v in pdf._cachedObjects.items():
                    if '/Type' in v and v['/Type'] == '/Catalog':
                        self.assertIsNotNone(v['/Names']['/EmbeddedFiles'])
                        real = len(v['/Names']['/EmbeddedFiles']['/Names'])
                        self.assertEqual(numAttachments * 2, real)
        finally:
            os.remove(testfile)

Example #6

0

Show file

    def testXRefTableObjects(self):
        """
        Ensures that after ``PdfFileReader._parsePdfFile()`` all the indirect
        references from the XRef-Table *only* have been loaded as expected.
        Objects from the free entries list are included as well in the test.

        This case tests the part of ``PdfFileReader.objects()`` responsible for
        generating the Cross-Reference Table entries too.

        For each input file, the sorted ``(id, gen, byte offset)`` triples
        obtained from the reader are compared with the sorted entries that
        ``self._parseXRefTable()`` returns for the matching file found under
        ``self.localDataRoot``.
        """
        self.maxDiff = None
        inputFiles = ("jpeg.pdf", "Seige_of_Vicksburg_Sample_OCR.pdf",
                      "SF424_page2.pdf")

        for filename in inputFiles:
            filepath = join(TEST_DATA_ROOT, filename)
            xtablepath = join(self.localDataRoot, filename)
            r = PdfFileReader(filepath)

            try:
                # (id, gen, byte offset)-valued list
                actualItems = sorted(
                    (ref.idnum, ref.generation,
                     r._xrefTable[ref.generation][ref.idnum][0])
                    for ref in r.objects(PdfFileReader.R_XTABLE, True))
            finally:
                # Release the reader even if walking the entries fails
                r.close()

            # We artificially read the XRef Table entries that we know belong
            # to filepath, and store them into expItems.
            expItems = sorted(self._parseXRefTable(xtablepath, (0, 1, 2)))

            self.assertListEqual(expItems, actualItems,
                                 "Differences found in " + filename)

Example #7

0

Show file

    def testDel(self):
        """
        Tests the ``__del__()`` method of ``PdfFileReader`` and
        ``PdfFileWriter`` ensuring that no exceptions are raised.

        ``__del__()`` is invoked explicitly on one instance of each class; any
        exception escaping from it is reported as a test failure.
        """
        r = PdfFileReader(join(TEST_DATA_ROOT, "crazyones.pdf"))
        w = PdfFileWriter(BytesIO(b""))

        for obj, cls in ((r, PdfFileReader), (w, PdfFileWriter)):
            try:
                obj.__del__()
            except Exception as e:  # pylint: disable=broad-except
                # self.fail() states the intent directly, instead of
                # asserting that a constant False is true
                self.fail(
                    "Exception '%s' was raised in %s.__del__()" %
                    (e, cls.__name__)
                )

Example #8

0

Show file

    def testDecodeStreamData(self):
        """
        Verifies that ``decodeStreamData()`` produces the same output as
        calling the matching codec's ``decode()`` directly on the raw data of
        a stream object.

        A new file stream is opened for each object reference -- unit tests
        don't have to be efficient.
        """
        this_dir = join(TEST_DATA_ROOT, self.testDecodeStreamData.__name__)
        filters = (
            # (filter type, filename, id, gen. number)
            (FlateCodec, "FlateDecode.pdf", 4, 0),
            (FlateCodec, "FlateDecode.pdf", 8, 0),
            (FlateCodec, "FlateDecode.pdf", 9, 0),
            # TO-DO No PDF files found with this type of encoding, get them.
            # (ASCIIHexCodec, "ASCIIHexDecode.pdf", ?, ?)
            (LZWCodec, "LZWDecode.pdf", 209, 0),
            (LZWCodec, "LZWDecode.pdf", 210, 0),
            (LZWCodec, "LZWDecode.pdf", 211, 0),
            (ASCII85Codec, "ASCII85Decode.pdf", 5, 0),
            (ASCII85Codec, "ASCII85Decode.pdf", 6, 0),
            (DCTCodec, "DCTDecode.pdf", 4, 0),
            # TO-DO No PDF files found with this type of encoding, get them.
            # (JPXCodec, "JPXDecode.pdf", ?, ?)
            (CCITTFaxCodec, "CCITTFaxDecode.pdf", 46, 0),
        )

        for codec, filename, idnum, generation in filters:
            with open(join(this_dir, filename), "rb") as infile:
                reader = PdfFileReader(infile)
                reference = IndirectObject(idnum, generation, reader)
                stream = reader.getObject(reference)

                # Ensures that the PdfFileReader reads a stream object
                self.assertEqual(EncodedStreamObject, type(stream))

                # The CCITT fax codec is additionally given the /Height entry
                decodeArgs = [stream._data, stream.get("/DecodeParms")]
                if codec is CCITTFaxCodec:
                    decodeArgs.append(stream.get("/Height"))

                self.assertEqual(
                    codec.decode(*decodeArgs),
                    decodeStreamData(stream),
                )

Example #9

0

Show file

    def testReadXRefStreamCompressedObjects(self):
        """
        Targets the same objects as ``testXRefStreamObjects()``, but rather
        than comparing the lists of entries, it verifies that the *contents*
        of the compressed objects are identical to the expected ones.

        This method does **not** test ``PdfFileReader.objects()`` as two of the
        previous test cases did.
        """
        self.maxDiff = None
        inputFiles = ("crazyones.pdf", )

        def objectId(pair):
            # expItems and actualItems hold two-element tuples whose first
            # element is the object ID, used to sort.
            return pair[0]

        for filename in inputFiles:
            filepath = join(self.localDataRoot, filename)
            r = PdfFileReader(join(TEST_DATA_ROOT, filename))
            expItems = []
            actualItems = []

            with open(filepath, "rb") as instream:
                for line in instream:
                    # Skip empty/blank lines and lines starting with "%"
                    isBlank = not line or line.isspace()
                    if isBlank or line.startswith(b"%"):
                        continue

                    globalId, offset, rawObj = line.split(b" ", 2)
                    # offset is converted (hence validated) but not used
                    globalId, offset = int(globalId), int(offset)

                    with BytesIO(rawObj) as objStream:
                        parsed = readObject(objStream, r)

                    expItems.append((globalId, parsed))

            for itemid, entry in r._xrefStm.items():
                # We deal exclusively with compressed objects (from Table 18 of
                # ISO 32000 reference, 2008) whose generation number is 0
                if entry[0] == 2:
                    # (ID, PdfObject) tuples
                    resolved = IndirectObject(itemid, 0, r).getObject()
                    actualItems.append((itemid, resolved))

            r.close()
            expItems.sort(key=objectId)
            actualItems.sort(key=objectId)

            self.assertListEqual(expItems, actualItems)

Example #10

0

Show file

    def testXTableAgainstXStream(self):
        """
        In section 7.5.8.4 of ISO 32000, "Compatibility with Applications That
        Do Not Support Compressed Reference Streams", the standard describes a
        means of crafting PDF files designed for versions 1.5+ that can be
        opened nevertheless by readers that support older versions.

        This test case verifies that the item IDs listed for the XRef Table
        are *all and exactly* the ones loaded into the XRef Stream, and that
        the entries of type 2 in the XRef Stream carry a truthy last field in
        the corresponding XRef Table record.
        """
        self.maxDiff = None
        # TO-DO Possibly add a few other files to this test case
        inputFiles = ("GeoBase_NHNC1_Data_Model_UML_EN.pdf", )

        for filename in inputFiles:
            filepath = join(self.localDataRoot, filename)

            # Map each item ID to the remaining fields of its record
            expItems = {}
            for record in self._parseXRefTable(filepath, (0, 2, 3)):
                expItems[record[0]] = record[1:]

            r = PdfFileReader(join(TEST_DATA_ROOT, filename))
            actualItems = list(r.objects(PdfFileReader.R_XSTREAM, True))
            r.close()

            actualItems.sort(key=lambda ref: ref.idnum)
            expKeys = sorted(expItems)
            actualKeys = [ref.idnum for ref in actualItems]

            self.assertListEqual(expKeys, actualKeys,
                                 "Lists of item IDs are not identical")

            for expId, ref in zip(expKeys, actualItems):
                self.assertEqual(expId, ref.idnum,
                                 "Items ID does not correspond")

                # If an item is in use in the XRef Stream, ensure then that it
                # is marked free in the XRef Table.
                entryType = r._xrefStm[ref.idnum][0]
                if entryType == 2:
                    self.assertTrue(
                        expItems[expId][-1],
                        "Item %d should be hid by the XRef Table, but it was "
                        "not." % expId,
                    )

Example #11

0

Show file

    def testXRefStreamObjects(self):
        """
        Counterpart of ``PdfReaderTestCases.testXRefTableObjects()`` for the
        objects referenced by the Cross-Reference Stream. The second part of
        ``PdfFileReader.objects()`` (dealing with XStream objects) is invoked
        and implicitly tested.
        """
        inputFiles = ("crazyones.pdf", )

        for filename in inputFiles:
            filepath = join(self.localDataRoot, filename)
            r = PdfFileReader(join(TEST_DATA_ROOT, filename))
            # Two lists of tuples as explained by Table 18
            expItems = []
            actualItems = []

            with open(filepath, "r") as instream:
                for line in instream:
                    # Skip empty/blank lines and lines starting with "%"
                    isBlank = not line or line.isspace()
                    if isBlank or line.startswith("%"):
                        continue

                    # Exactly three integer fields are expected per line
                    entryType, field2, field3 = map(int, line.split())
                    expItems.append((entryType, field2, field3))

            for item in r.objects(PdfFileReader.R_XSTREAM, True):
                entry = r._xrefStm[item.idnum]
                entryType = entry[0]

                if entryType in (0, 1):
                    self.assertEqual(entry[2], item.generation)
                elif entryType == 2:
                    self.assertEqual(item.generation, 0)

                actualItems.append(entry)

            r.close()
            actualItems.sort()
            expItems.sort()

            self.assertListEqual(
                expItems,
                actualItems,
                "Didn't correctly read the Cross-Reference Stream",
            )

Example #12

0

Show file

    def testIsObjectFree(self):
        """
        Tests the ``PdfFileReader.isObjectFree()`` method by comparing, for
        every XRef Table reference, the ``(id, generation, free flag)`` triple
        with the records returned by ``self._parseXRefTable()``.
        """
        # TO-DO Find PDF files that feature free-entry lists. We are checking
        # isObjectFree() only against used items.
        inputFiles = (
            "jpeg.pdf",
            "Seige_of_Vicksburg_Sample_OCR.pdf",
            "SF424_page2.pdf",
        )

        for filename in inputFiles:
            filepath = join(self.localDataRoot, filename)
            r = PdfFileReader(join(TEST_DATA_ROOT, filename))
            expItems = self._parseXRefTable(filepath, (0, 1, 3))

            # This is where isObjectFree() gets invoked
            actualItems = [
                (ref.idnum, ref.generation, r.isObjectFree(ref))
                for ref in r.objects(PdfFileReader.R_XTABLE, True)
            ]

            r.close()

            self.assertListEqual(sorted(expItems), sorted(actualItems))

Example #13

0

Show file

    def testAddAttachment(self):
        """
        Tests the addAttachment function for attaching a single file.

        Since the Names array in the EmbeddedFiles dictionary contains both the
        name (string) and indirect object (dictionary) for each file, we have
        to check for two entries per attached file.
        """

        # mkstemp() returns an already-open OS-level file descriptor along
        # with the path. Only the path is needed here, so the descriptor is
        # closed right away rather than leaked (an open handle would also
        # prevent os.remove() from succeeding on Windows).
        fd, testfile = tempfile.mkstemp()
        os.close(fd)

        try:
            # Make PDF with attachment
            with PdfFileReader(join(TEST_DATA_ROOT, "jpeg.pdf")) as reader:
                with PdfFileWriter(testfile) as writer:
                    writer.appendPagesFromReader(reader)

                    attachment_path = join(
                        TEST_DATA_ROOT, "attachment_small.png")
                    with open(attachment_path, "rb") as attachment_stream:
                        read_data = attachment_stream.read()
                        writer.addAttachment("attachment_small.png", read_data)
                    writer.write()

            # Check for attachment entries
            with PdfFileReader(testfile) as pdf:
                # For caching _cachedObjects data
                pdf.numPages  # pylint: disable=pointless-statement
                for _k, v in pdf._cachedObjects.items():
                    if "/Type" in v and v["/Type"] == "/Catalog":
                        self.assertIsNotNone(v["/Names"]["/EmbeddedFiles"])
                        real = len(v["/Names"]["/EmbeddedFiles"]["/Names"])
                        self.assertEqual(2, real)
        finally:
            os.remove(testfile)

Example #14

0

Show file

    def testContextManager(self):
        """
        Tests the context manager implementation (the ``with <expr> as
        identifier`` feature) of ``PdfFileReader``: the reader must be open
        inside the ``with`` block and closed once the block is left.
        """
        inputFiles = (
            "jpeg.pdf",
            "Seige_of_Vicksburg_Sample_OCR.pdf",
            "SF424_page2.pdf",
        )

        for filename in inputFiles:
            with PdfFileReader(join(TEST_DATA_ROOT, filename)) as reader:
                # Test assertions not strictly related to the whole test case
                self.assertEqual(filename, basename(reader.filepath))
                self.assertFalse(reader.isClosed)

            # Leaving the block must have closed the reader
            self.assertTrue(reader.isClosed)

Example #15

0

Show file

def main():
    """
    Demonstrates a handful of ``PdfFileReader``/``PdfFileWriter`` features.

    The input path is read from ``argv[1]`` and, optionally, the output path
    from ``argv[2]``. The input document must have at least five pages. The
    output document is made of page 1 unchanged, page 2 rotated clockwise,
    page 3 rotated counter-clockwise, page 4 merged with a watermark page and
    page 5 with the upper right corner of its media box halved; some
    Javascript is added and the document is encrypted before being written.

    Prints the usage text and exits with status 0 when a help flag is given,
    or with status 1 when no input path is given. Also exits with status 1
    when the input document has too few pages.
    """
    pagesRequired = 5
    output = "PyPDF-Features-Output.pdf"

    if set(argv) & FLAG_HELP:
        print(USAGE)
        exit(0)
    elif len(argv) < 2:
        print(USAGE)
        exit(1)
    else:
        inputpath = argv[1].strip()
        filename = basename(inputpath)

        if len(argv) > 2:
            output = argv[2].strip()

    # The input file is opened in a ``with`` block so that it is closed on
    # every path out of this function, the early exit(1) below included.
    with open(inputpath, "rb") as instream:
        # We can instantiate a PdfFileReader/Writer by giving in a stream
        # object or a path string
        reader = PdfFileReader(instream)
        writer = PdfFileWriter(output)

        # Check that the PDF file has the required number of pages
        if reader.numPages < pagesRequired:
            print(
                "We require a document with %d pages at least, %s has %d"
                % (pagesRequired, filename, reader.numPages),
                file=stderr,
            )
            exit(1)
        else:
            print("'%s' has %d pages... OK" % (filename, reader.numPages))

        # Add page 1 from reader to output document, unchanged
        writer.addPage(reader.getPage(0))

        # Add page 2 from reader, but rotated clockwise 90 degrees
        writer.addPage(reader.getPage(1).rotateClockwise(90))

        # Add page 3 from reader, rotated the other way:
        writer.addPage(reader.getPage(2).rotateCounterClockwise(90))
        # Alt.: writer.addPage(reader.getPage(2).rotateClockwise(270))

        # Add page 4 from reader, but first add a watermark from another PDF.
        # The watermark file stays open until the output has been written,
        # in case its page data is only read at that point.
        page4 = reader.getPage(3)
        watermarkpath = join(SAMPLE_PDF_ROOT, "AutoCad_Diagram.pdf")

        with open(watermarkpath, "rb") as watermarkstream:
            watermark = PdfFileReader(watermarkstream)
            page4.mergePage(watermark.getPage(0))
            writer.addPage(page4)

            # Add page 5 from reader, but crop it to half size:
            page5 = reader.getPage(4)
            page5.mediaBox.upperRight = (
                page5.mediaBox.getUpperRight_x() / 2,
                page5.mediaBox.getUpperRight_y() / 2,
            )
            writer.addPage(page5)

            # Add some Javascript to launch the print window on opening this
            # PDF. The password dialog may prevent the print dialog from being
            # shown. Comment the encrypted lines, if that's the case, to try
            # this out
            writer.addJS(
                "this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

            # Encrypt your new PDF and add a password
            password = "******"
            writer.encrypt(password)

            # Finally, write the resulting PDF document to ``output``
            writer.write()

            print("Output successfully written to", output)

            reader.close()
            writer.close()

Example #16

0

Show file

 def setUp(self):
     """
     Prepares ``self.writer``: an in-memory ``PdfFileWriter`` to which the
     pages of ``crazyones.pdf`` have been appended.
     """
     sourcePath = join(TEST_DATA_ROOT, 'crazyones.pdf')
     source = PdfFileReader(sourcePath)

     self.writer = PdfFileWriter(BytesIO(b""))
     self.writer.appendPagesFromReader(source)

Example #17

0

Show file

 def setUp(self):
     """
     Prepares ``self.pdfFileWriter``: a ``PdfFileWriter`` to which the pages
     of ``crazyones.pdf`` have been appended.
     """
     sourcePath = join(TEST_DATA_ROOT, 'crazyones.pdf')
     source = PdfFileReader(sourcePath)

     self.pdfFileWriter = PdfFileWriter()
     self.pdfFileWriter.appendPagesFromReader(source)

Example #18

0

Show file

 def setUp(self):
     """
     Prepares ``self.writer``: an in-memory ``PdfFileWriter`` to which the
     pages of ``crazyones.pdf`` have been appended.
     """
     sourcePath = join(TEST_DATA_ROOT, "crazyones.pdf")
     source = PdfFileReader(sourcePath)

     self.writer = PdfFileWriter(BytesIO(b""))
     self.writer.appendPagesFromReader(source)