Example #1
0
def printMeta(filename):
    """Print every document-information entry of the PDF at ``filename``.

    A header line naming the file is printed first, followed by one
    ``[+] key : value`` line per metadata entry. A PDF without a document
    information dictionary prints only the header.

    Args:
        filename: Path of the PDF file to inspect.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
    """
    # Keep the file open until printing is done: the reader is handed the
    # open stream and the metadata values are looked up through it.
    with open(filename, 'rb') as pdfStream:
        docInfo = PdfFileReader(pdfStream).getDocumentInfo()
        print('[*] PDF MetaData For: {}'.format(filename))
        # getDocumentInfo() returns None when the PDF carries no /Info
        # dictionary; treat that as "no entries" instead of crashing.
        for metaItem in docInfo or {}:
            print('[+] {0} : {1}'.format(metaItem, docInfo[metaItem]))
Example #2
0
def readPDF(inFileName, outFileName):
    """Blank a fixed region on page 1 and strip "COPY-ONLY" text from a PDF.

    The work is done in two passes:

    1. A borderless white rectangle is drawn on a one-page overlay, merged
       onto the first page of ``inFileName``, and the whole document is
       serialised to an in-memory buffer.
    2. That buffer is re-read. On every page, each ``Tj`` operation whose
       operand list is exactly the two-bytes-per-character byte string
       spelling ``COPY-ONLY`` gets its operand replaced with an empty
       string. The result is written to ``outFileName``.

    Args:
        inFileName: Path of the PDF to read.
        outFileName: Path the processed PDF is written to.

    Returns:
        ``outFileName`` as a string on success. ``None`` if any step
        failed; the failure is printed to stdout rather than raised.
    """
    try:
        # --- Pass 1: put a white rectangle on page 1 -----------------------
        # Create a borderless white rectangle on its own one-page PDF.
        overlayBuffer = io.BytesIO()
        can = canvas.Canvas(overlayBuffer, pagesize=letter)
        # Colour components are fractions in the range 0..1; (1, 1, 1) is white.
        can.setFillColorRGB(1, 1, 1)
        can.rect(450, 550, 100, 40, fill=1, stroke=0)
        can.save()

        # Rewind the byte stream so it can be read back as a PDF.
        overlayBuffer.seek(0)
        overlayPage = PdfFileReader(overlayBuffer).getPage(0)

        # The intermediate document lives in memory, so no scratch file is
        # created and nothing can collide with the input or output paths.
        interimBuffer = io.BytesIO()
        with open(inFileName, 'rb') as inStream:
            existingPdf = PdfFileReader(inStream)

            # Merge the rectangle onto the first page only.
            existingPdf.getPage(0).mergePage(overlayPage)

            interimOutput = PdfFileWriter()
            for n in range(existingPdf.getNumPages()):
                interimOutput.addPage(existingPdf.getPage(n))

            # Serialise while the input file is still open, since the pages
            # being written belong to the reader bound to that stream.
            interimOutput.write(interimBuffer)
        interimBuffer.seek(0)

        # --- Pass 2: blank the "COPY-ONLY" text on every page --------------
        pdfReader = PdfFileReader(interimBuffer)
        pdfWriter = PdfFileWriter()
        showTextOperator = b_("Tj")
        watermarkOperands = [b'\x00C\x00O\x00P\x00Y\x00-\x00O\x00N\x00L\x00Y']

        for pageNum in range(pdfReader.getNumPages()):
            pageObject = pdfReader.getPage(pageNum)

            # Add the page to the writer; the edit below is applied to this
            # same page object.
            pdfWriter.addPage(pageObject)

            # A page without a /Contents entry has nothing to scan; copy it
            # through unchanged instead of failing the whole document.
            if '/Contents' not in pageObject:
                continue

            # Parse /Contents (e.g. [IndirectObject(4, 0)]) into a content
            # stream whose operations can be inspected and edited.
            pageContent = ContentStream(pageObject['/Contents'], pdfReader)

            # Loop through operators and operands in the contents.
            for operands, operator in pageContent.operations:
                if operator == showTextOperator and operands == watermarkOperands:
                    operands[0] = TextStringObject('')

            # Replace /Contents in the page with the edited stream.
            pageObject[NameObject('/Contents')] = pageContent

        # Write to the output file.
        with open(outFileName, "wb") as outStream:
            pdfWriter.write(outStream)

        return str(outFileName)

    except Exception:
        # Deliberate best-effort: report what failed and where, then signal
        # failure to the caller with None instead of propagating.
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(f"{exc_type}, {exc_obj} , {fname} : {exc_tb.tb_lineno}")
        return None